rds: revert RDS code to 8cbd960 commit to rebase UEK commits
author    Mukesh Kacker <mukesh.kacker@oracle.com>
          Tue, 7 Jul 2015 23:17:18 +0000 (16:17 -0700)
committer Mukesh Kacker <mukesh.kacker@oracle.com>
          Tue, 7 Jul 2015 23:37:22 +0000 (16:37 -0700)
Reverting net/rds code to the following commit:
8cbd960 2009-04-01 RDS: Use spinlock to protect 64b value update on 32b archs [Andy Grover]

This is the common ancestor point for the RDS code in the Linux kernel
repository and the Mellanox OFED-1.5.5 R2 repository.

From this point, we fetch the RDS code from the Mellanox OFED-1.5.5 repository.
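
As background on the anchor point: the 8cbd960 subject above names a standard
kernel pattern. On a 32-bit architecture a 64-bit store compiles to two
word-sized stores, so a concurrent reader can observe a torn value unless
writer and reader serialize on a lock. A minimal sketch of that pattern
follows (illustrative only; the lock and helper names are assumptions, not
the code actually touched by 8cbd960):

    #include <linux/spinlock.h>
    #include <linux/types.h>

    static DEFINE_SPINLOCK(stat_lock);      /* assumed name */
    static u64 stat_value;

    /* Writers and readers both take the lock: a bare 64-bit
     * load/store is not atomic on 32-bit archs. */
    static void rds_stat_add64(u64 delta)   /* hypothetical helper */
    {
            unsigned long flags;

            spin_lock_irqsave(&stat_lock, flags);
            stat_value += delta;
            spin_unlock_irqrestore(&stat_lock, flags);
    }

    static u64 rds_stat_read64(void)
    {
            unsigned long flags;
            u64 val;

            spin_lock_irqsave(&stat_lock, flags);
            val = stat_value;
            spin_unlock_irqrestore(&stat_lock, flags);
            return val;
    }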

Signed-off-by: Ajaykumar Hotchandani <ajaykumar.hotchandani@oracle.com>
--------------
Revert details:
--------------
Revert "net/rds: RDS-TCP: only initiate reconnect attempt on outgoing TCP socket."
This reverts commit c82ac7e69efe6dbe370d6ba84e2666d7692ef1c2.

Revert "net/rds: RDS-TCP: Always create a new rds_sock for an incoming connection."
This reverts commit f711a6ae062caeee46067b2f2f12ffda319ae73c.

Revert "net/rds: Fix new sparse warning"
This reverts commit e2783717a71e9babfdd7c36c7e35b790d2c01022.

Revert "net/rds: fix unaligned memory access"
This reverts commit c0adf54a10903b59037a4c5fcb933dfeeb7b2624.

Revert "net: Remove iocb argument from sendmsg and recvmsg" for net/rds
This reverts commit 1b784140474e4fc94281a49e96c67d29df0efbde for net/rds.

Revert "RDS: make sure not to loop forever inside rds_send_xmit"
This reverts commit 443be0e5affe3acb6dd81e7402951677e0a0eb35.

Revert "RDS: only use passive connections when addresses match"
This reverts commit 1789b2c077f6d6c82b04cfe49a0fec020dc42488.

Revert "rds: avoid potential stack overflow"
This reverts commit f862e07cf95d5b62a5fc5e981dd7d0dbaf33a501.

Revert "rds: rds_cong_queue_updates needs to defer the congestion update transmission"
This reverts commit 80ad0d4a7a75158f2824d541e4802c88aba4f063.

Revert "rds: Make rds_message_copy_from_user() return 0 on success."
This reverts commit d0a47d32724bf0765b8768086ef1a7a6d074a7a0.

Revert "net: rds: Remove repeated function names from debug output"
This reverts commit 11ac11999bae3c353f86b6e7dd0e43d4a0eada12.

Revert "net: rds: use correct size for max unacked packets and bytes"
This reverts commit db27ebb111e9f69efece08e4cb6a34ff980f8896.

Revert "rds: Fix min() warning in rds_message_inc_copy_to_user()"
This reverts commit 6ff4a8ad4b6eae5171754fb60418bc81834aa09b.

Revert "net: introduce helper macro for_each_cmsghdr" for net/rds
This reverts commit f95b414edb18de59940dcebbefb49cf25c6d505c for net/rds.

Revert "put iov_iter into msghdr" for net/rds
This reverts commit c0371da6047abd261bc483c744dbc7d81a116172 for net/rds.

Revert "rds: switch rds_message_copy_from_user() to iov_iter"
This reverts commit 083735f4b01b703184c0e11c2e384b2c60a8aea4.

Revert "rds: switch ->inc_copy_to_user() to passing iov_iter"
This reverts commit c310e72c89926e06138e4881f21e4c8da3e7ef18.

Revert "rds: avoid calling sock_kfree_s() on allocation failure"
This reverts commit dee49f203a7feef5d00c416b7dc7e34a7caba8e1.

Revert "net/rds: fix possible double free on sock tear down"
This reverts commit 593cbb3ec6a3f2424966832727f394b1696d0d72.

Revert "net/rds: do proper house keeping if connection fails in rds_tcp_conn_connect"
This reverts commit eb74cc97b830c1e438dc1d6b049f17bdb2b9aae5.

Revert "net/rds: call rds_conn_drop instead of open code it at rds_connect_complete"
This reverts commit 310886dd5fa3606d9325b10caf7c8ba5e9f9ab03.

Revert "treewide: fix synchronize_rcu() in comments" for net/rds
This reverts commit d7cdb968081727746c8d2fb31b12ea6d1694888e for net/rds.

Revert "net: Replace get_cpu_var through this_cpu_ptr" for net/rds
This reverts commit 903ceff7ca7b4d80c083a80ee5163b74e9fa359f for net/rds.

Revert "rds/tcp_listen: Replace comma with semicolon"
This reverts commit 01728371dc261c876d07e9228a55a096b6d9a1f9.

Revert "RDS/RDMA: Replace comma with semicolon"
This reverts commit cc2afe9fe25d99cbe19eac57c0f8f098b2710b7c.

Revert "net: rds: Use time_after() for time comparison"
This reverts commit 71fd762f2eef6acc848e262ac934fc694b49204e.

Revert "rds: remove the unneed NULL checking"
This reverts commit be7faf7168e831f17b85a96f2f797f504b66cfd7.

Revert "arch: Mass conversion of smp_mb__*()" for net/rds
This reverts commit 4e857c58efeb99393cba5a5d0d8ec7117183137c for net/rds.

Revert "net: Fix use after free by removing length arg from sk_data_ready callbacks." for net/rds
This reverts commit 676d23690fb62b5d51ba5d659935e9f7d9da9f8e for net/rds.

Revert "rds: prevent dereference of a NULL device in rds_iw_laddr_check"
This reverts commit bf39b4247b8799935ea91d90db250ab608a58e50.

Revert "net: add build-time checks for msg->msg_name size" for net/rds
This reverts commit 342dfc306fb32155314dad277f3c3686b83fb9f1 for net/rds.

Revert "net: rds: fix per-cpu helper usage"
This reverts commit c196403b79aa241c3fefb3ee5bb328aa7c5cc860.

Revert "net: replace macros net_random and net_srandom with direct calls to prandom" for net/rds
This reverts commit 63862b5bef7349dd1137e4c70702c67d77565785 for net/rds.

Revert "rds: prevent dereference of a NULL device"
This reverts commit c2349758acf1874e4c2b93fe41d072336f1a31d0.

Revert "rds: prevent BUG_ON triggered on congestion update to loopback"
This reverts commit 18fc25c94eadc52a42c025125af24657a93638c0.

Revert "net: rework recvmsg handler msg_name and msg_namelen logic" for net/rds
This reverts commit f3d3342602f8bcbf37d7c46641cb9bca7618eb1c for net/rds.

Revert "inet: convert inet_ehash_secret and ipv6_hash_secret to net_get_random_once" for net/rds
This reverts commit 1bbdceef1e535add893bf71d7b7ab102e4eb69eb for net/rds.

Revert "ipv4: split inet_ehashfn to hash functions per compilation unit" for net/rds
This reverts commit 65cd8033ff375b68037df61603ee68070dc48578 for net/rds.

Revert "net: misc: Remove extern from function prototypes" for net/rds
This reverts commit c1b1203d65955c179fec617ff17a21273f33a414 for net/rds.

Revert "net: Convert uses of typedef ctl_table to struct ctl_table" for net/rds
This reverts commit fe2c6338fd2c6f383c4d4164262f35c8f3708e1f for net/rds.

Revert "net/rds: zero last byte for strncpy"
This reverts commit 2e85d67690cf3ea3f074a6e872f675226883fe7f.

Revert "rds: simplify a warning message"
This reverts commit 7dac1b514a00817ddb43704068c14ffd8b8fba19.

Revert "rds: limit the size allocated by rds_message_alloc()"
This reverts commit ece6b0a2b25652d684a7ced4ae680a863af041e0.

Revert "hlist: drop the node parameter from iterators" for net/rds
This reverts commit b67bfe0d42cac56c512dd5da4b1b347a23f4b70a for net/rds.

Revert "net/rds: remove depends on CONFIG_EXPERIMENTAL"
This reverts commit e34430eeca6f3bd2308d4b4917cfb3a4367c2073.

Revert "IB/rds: suppress incompatible protocol when version is known"
This reverts commit a49675988c127b5b5876c252e5db2ee0410a10c2.

Revert "IB/rds: Correct ib_api use with gs_dma_address/sg_dma_len"
This reverts commit f2e9bd70327d788011cf787a51ceba5925bbc63a.

Revert "net: rds: use this_cpu_* per-cpu helper"
This reverts commit ae4b46e9d7128d2d76e6857fe0b9fc240e8ac695.

Revert "UAPI: (Scripted) Disintegrate include/linux" for
       include/linux/rds.h and include/uapi/linux/rds.h
This reverts commit 607ca46e97a1b6594b29647d98a32d545c24bdff for
       include/linux/rds.h and include/uapi/linux/rds.h.

Revert "RDS: fix rds-ping spinlock recursion"
This reverts commit 5175a5e76bbdf20a614fb47ce7a38f0f39e70226.

Revert "rds: Don't disable BH on BH context"
This reverts commit bfdc587c5af4ff155cf702b972e8fcd66d77d5f2.

Revert "rds: set correct msg_namelen"
This reverts commit 06b6a1cf6e776426766298d055bb3991957d90a7.

Revert "net: Fix (nearly-)kernel-doc comments for various functions" for net/rds
This reverts commit 2c53040f018b6c36a46eec75b9b937aaa5f78e6d for net/rds.

Revert "rds_rdma: don't assume infiniband device is PCI"
This reverts commit a0c6ffbcfe600606b2d913dded4dc6b37b3bbbfd.

Revert "sock: Introduce named constants for sk_reuse" for net/rds
This reverts commit 4a17fd5229c1b6066aa478f6b690f8293ce811a1 for net/rds.

Revert "net: Convert all sysctl registrations to register_net_sysctl" for net/rds
This reverts commit ec8f23ce0f4005b74013d4d122e0d540397a93c9 for net/rds.

Revert "net: Move all of the network sysctls without a namespace into init_net." for net/rds
This reverts commit 5dd3df105b9f6cb7dd2472b59e028d0d1c878ecb for net/rds.

Revert "RDS: use gfp flags from caller in conn_alloc()"
This reverts commit f0229eaaf3f82522e2b16b41b0f45bb84a88d1b0.

Revert "Remove printk from rds_sendmsg"
This reverts commit a6506e1486181975d318344143aca722b2b91621.

Revert "rds: remove the second argument of k[un]map_atomic()"
This reverts commit 6114eab535ab49239e0a6ce08eb9243664aef993.

Revert "rds: Fix typo in iw_recv.c and ib_recv.c"
This reverts commit 5fd5c44d3f27c93685d4a036565245f3cdb8c033.

Revert "rds: Make rds_sock_lock BH rather than IRQ safe."
This reverts commit efc3dbc37412c027e363736b4f4c74ee5e8ecffc.

Revert "RDS: Remove some unused iWARP code"
This reverts commit 5b7bf42e3d47fb16aaf53776ae3eaaf1be247a35.

Revert "rds: drop "select LLIST""
This reverts commit 77c1c7c4bd4751dbf47cdacd0e73e67f0a1ed316.

Revert "treewide: use __printf not __attribute__((format(printf,...)))" for net/rds
This reverts commit b9075fa968a0a4347aef35e235e2995c0e57dddd for net/rds.

Revert "net: Add export.h for EXPORT_SYMBOL/THIS_MODULE to non-modules" for net/rds
This reverts commit bc3b2d7fb9b014d75ebb79ba371a763dbab5e8cf for net/rds.

Revert "net: add moduleparam.h for users of module_param/MODULE_PARM_DESC" for net/rds
This reverts commit d9b9384215e17c68d7b6bd05d6fa409e5d4140d7 for net/rds.

Revert "net: Fix files explicitly needing to include module.h" for net/rds
This reverts commit 3a9a231d977222eea36eae091df2c358e03ac839 for net/rds.

Revert "RDSRDMA: Fix cleanup of rds_iw_mr_pool"
This reverts commit 85a64889492b45f931ddac87ec09d84aa7347ee1.

Revert "net, rds, Replace xlist in net/rds/xlist.h with llist"
This reverts commit 1bc144b62524970c8580f6d97a6df0e71c6ee388.

Revert "net: Convert vmalloc/memset to vzalloc" for net/rds
This reverts commit 3dbd4439837f2cfd2ff302897353f4b1b6263b2a for net/rds.

Revert "notifiers: cpu: move cpu notifiers into cpu.h" for net/rds
This reverts commit 80f1ff97d0a9d92f44d2b2dd9425afa950e58f2b for net/rds.

Revert "net: rds: fix const array syntax"
This reverts commit 3e878b8d54e0fc12df363ee8e4a638c8147aac98.

Revert "net/rds: use prink_ratelimited() instead of printk_ratelimit()"
This reverts commit cb0a60564943db21ed3af975ac3d578cdc80b329.

Revert "net: remove interrupt.h inclusion from netdevice.h" for net/rds
This reverts commit a6b7a407865aab9f849dd99a71072b7cd1175116 for net/rds.

Revert "RDMA/cma: Pass QP type into rdma_create_id()" for net/rds
This reverts commit b26f9b9949013fec31b23c426fc463164ae08891 for net/rds.

Revert "Fix common misspellings" for net/rds
This reverts commit 25985edcedea6396277003854657b5f3cb31a628 for net/rds.

Revert "rds: use little-endian bitops"
This reverts commit e1dc1c81b9d1c823f2a529b9b9cf8bf5dacbce6a.

Revert "rds: stop including asm-generic/bitops/le.h directly"
This reverts commit 12ce22423abacca70bf1dfbcb8543b3e2b74aad4.

Revert "rds: prevent BUG_ON triggering on congestion map updates"
This reverts commit 6094628bfd94323fc1cea05ec2c6affd98c18f7f.

Revert "rds/ib: use system_wq instead of rds_ib_fmr_wq"
This reverts commit c534a107e8fe446202b0fab102abc015c56c0317.

Revert "net: cleanup unused macros in net directory" for net/rds
This reverts commit 441c793a56502638d45d5da2195056d686147370 for net/rds.

Revert "Net: rds: Makefile: Remove deprecated items"
This reverts commit 094f2faaa2c4973e50979158f655a1d31a97ba98.

Revert "rds: Integer overflow in RDS cmsg handling"
This reverts commit 218854af84038d828a32f061858b1902ed2beec6.

Revert "rds: Fix rds message leak in rds_message_map_pages"
This reverts commit aa58163a76a3aef33c7220931543d45d0fe43753.

Revert "rds: Remove kfreed tcp conn from list"
This reverts commit 8200a59f24aeca379660f80658a8c0c343ca5c31.

Revert "rds: Lost locking in loop connection freeing"
This reverts commit 58c490babd4b425310363cbd1f406d7e508f77a5.

Revert "RDS: Let rds_message_alloc_sgs() return NULL"
This reverts commit d139ff0907dac9ef72fb2cf301e345bac3aec42f.

Revert "RDS: Copy rds_iovecs into kernel memory instead of rereading from userspace"
This reverts commit fc8162e3c034af743d8def435fda6396603d321f.

Revert "RDS: Clean up error handling in rds_cmsg_rdma_args"
This reverts commit f4a3fc03c1d73753879fb655b8cd628b29f6706b.

Revert "RDS: Return -EINVAL if rds_rdma_pages returns an error"
This reverts commit a09f69c49b84b161ebd4dd09d3cce1b68297f1d3.

Revert "net: fix rds_iovec page count overflow"
This reverts commit 1b1f693d7ad6d193862dcb1118540a030c5e761f.

Revert "rds: make local functions/variables static"
This reverts commit ff51bf841587c75b58d25ed77263158619784dd3.

Revert "De-pessimize rds_page_copy_user"
This reverts commit 799c10559d60f159ab2232203f222f18fa3c4a5f.

Revert "net: fix a lockdep splat" for net/rds
This reverts commit f064af1e500a2bf4607706f0f458163bdb2a6ea5 for net/rds.

Revert "rds: spin_lock_irq() is not nestable"
This reverts commit aef3ea33e85035f7c827c1db9155f97f4b7ee725.

Revert "rds: double unlock in rds_ib_cm_handle_connect()"
This reverts commit f4fa7f3807d41b78056c6648b04bfadd737df21e.

Revert "rds: signedness bug"
This reverts commit 9b9d2e00bfa592aceda7b43da76c670df61faa97.

Revert "RDS: Remove dead struct from rds.h"
This reverts commit 905d64c89e2a9d71d0606904b7c3908633db6072.

Revert "RDS: rds.h: Replace u_int[size]_t with uint[size]_t"
This reverts commit a46f561b774d90d8616473d56696e7d44fa1c9f1.

Revert "RDS: Add rds.h to exported headers list" for include/linux/rds.h
This reverts commit fd128dfa50cfc4f2959dc4aa5d7468d33b988332 for include/linux/rds.h.

Revert "RDS: Implement masked atomic operations"
This reverts commit 20c72bd5f5f902e5a8745d51573699605bf8d21c.

Revert "RDS/IB: print string constants in more places"
This reverts commit 59f740a6aeb2cde2f79fe0df38262d4c1ef35cd8.

Revert "RDS: cancel connection work structs as we shut down"
This reverts commit 4518071ac1bcb76c64a55a3fddb39fb3d39add41.

Revert "RDS: don't call rds_conn_shutdown() from rds_conn_destroy()"
This reverts commit ffcec0e110c198717eb0f6ac000c1e5397db9451.

Revert "RDS: have sockets get transport module references"
This reverts commit 5adb5bc65f93e52341c3fc9d03d4030dd375e256.

Revert "RDS: remove old rs_transport comment"
This reverts commit 77510481c0c3980c8979ed236d63e59221fb8ce5.

Revert "RDS: lock rds_conn_count decrement in rds_conn_destroy()"
This reverts commit fe8ff6b58f040dd52d2db45972db8e0301847f1c.

Revert "RDS/IB: protect the list of IB devices"
This reverts commit ea819867b788728aca60717e4fdacb3df771f670.

Revert "RDS/IB: print IB event strings as well as their number"
This reverts commit 1bde04a63d532c2540d6fdee0a661530a62b1686.

Revert "RDS: flush fmrs before allocating new ones"
This reverts commit 8576f374ac9537674e3cccb0a9d43fa2b7ebbf5b.

Revert "RDS: properly use sg_init_table"
This reverts commit b4e1da3c9a0ac9b01f45a8578b7347e3a31f9fb8.

Revert "RDS/IB: track signaled sends"
This reverts commit f046011cd73c372267befd10242988eb744649fe.

Revert "RDS: remove __init and __exit annotation"
This reverts commit ef87b7ea39a91906218a262686bcb8bad8b6b46e.

Revert "RDS/IB: Use SLAB_HWCACHE_ALIGN flag for kmem_cache_create()"
This reverts commit c20f5b9633bb0953bd2422f0f1430a2028cdbd0a.

Revert "RDS/IB: always process recv completions"
This reverts commit d455ab64096b9a86849c7315c53e595330842db6.

Revert "RDS: return to a single-threaded krdsd"
This reverts commit 80c51be56ffa257d3177f0d750d90be65d30c22f.

Revert "RDS/IB: create a work queue for FMR flushing"
This reverts commit 515e079dab19cf774d1eec6e5f4ed65509e31ef1.

Revert "RDS/IB: destroy connections on rmmod"
This reverts commit 8aeb1ba6630ffd44001ae9833842794df0107676.

Revert "RDS/IB: wait for IB dev freeing work to finish during rmmod"
This reverts commit 24fa163a4bae74b3378d30e1bc776568cfca8121.

Revert "RDS/IB: Make ib_recv_refill return void"
This reverts commit b6fb0df12db6c8b6bbcc7b5c9459b3bbf5f0cee6.

Revert "RDS: Remove unused XLIST_PTR_TAIL and xlist_protect()"
This reverts commit fbf4d7e3d03587a983ee4e536251ea6c1c848ec2.

Revert "RDS: whitespace"
This reverts commit c9455d9996ba84af1f534c7e3944ea6f35d2fc54.

Revert "RDS: use delayed work for the FMR flushes"
This reverts commit 7a0ff5dbdd0b4cb7ea8764da9d78f4bb2eebaf31.

Revert "rds: more FMRs are faster"
This reverts commit eabb732279f1a41ac9d066aeb56973ae505c4cbc.

Revert "rds: recycle FMRs through lockless lists"
This reverts commit 6fa70da6081bbcf948801fd5ee0be4d222298a43.

Revert "rds: fix rds_send_xmit() serialization"
This reverts commit 0f4b1c7e89e699f588807a914ec6e6396c851a72.

Revert "rds: block ints when acquiring c_lock in rds_conn_message_info()"
This reverts commit 501dcccdb7a2335cde07d4acb56e636182d62944.

Revert "rds: remove unused rds_send_acked_before()"
This reverts commit 671202f3491cccdb267f88ad59ba0635aeb2a22e.

Revert "RDS: use friendly gfp masks for prefill"
This reverts commit 037f18a3074753991656189a091a5fa371999107.

Revert "RDS/IB: Add caching of frags and incs"
This reverts commit 33244125871734ebc0d8d147680a0d7e99385e0b.

Revert "RDS/IB: Remove ib_recv_unmap_page()"
This reverts commit fc24f78085e8771670af42f2b8929b16a0c98a22.

Revert "RDS: Assume recv->r_frag is always NULL in refill_one()"
This reverts commit 3427e854e1a0e76be8b3d75fc0fa878f59b43693.

Revert "RDS: Use page_remainder_alloc() for recv bufs"
This reverts commit 0b088e003ccf316a76c51be5dec2d70b93be3be8.

Revert "RDS/IB: disconnect when IB devices are removed"
This reverts commit fc19de38be924728fea76026c0d1a6c4b6156084.

Revert "RDS: introduce rds_conn_connect_if_down()"
This reverts commit f3c6808d3d8513db2b0543538fc35c25a60fe7a7.

Revert "RDS/IB: add refcount tracking to struct rds_ib_device"
This reverts commit 3e0249f9c05cb77b66f7f09644ca9ca208d991a9.

Revert "RDS/IB: get the xmit max_sge from the RDS IB device on the connection"
This reverts commit 89bf9d4158b5a1b6bd00960eb2e47601ec8cc138.

Revert "RDS/IB: rds_ib_cm_handle_connect() forgot to unlock c_cm_lock"
This reverts commit a46ca94e7fb2c93a59e08b42fd77d8c478fda5fc.

Revert "rds: Fix reference counting on the for xmit_atomic and xmit_rdma"
This reverts commit 1cc2228c599f173d77000a250bf0541294e1a7be.

Revert "rds: use RCU to protect the connection hash"
This reverts commit bcf50ef2ce3c5d8f2fe995259da16677898cb300.

Revert "RDS: use locking on the connection hash list"
This reverts commit abf454398c2ebafc629ebb8b149f5a752c79e919.

Revert "rds: Fix RDMA message reference counting"
This reverts commit c9e65383a20d9a656db70efbf67e57f8115ad776.

Revert "rds: don't let RDS shutdown a connection while senders are present"
This reverts commit 7e3f2952eeb1a0fe2aa9882fd1705a88f9d89b35.

Revert "rds: Use RCU for the bind lookup searches"
This reverts commit 38a4e5e61344490f18241333d7b1b368a3a38748.

Revert "RDS/IB: add _to_node() macros for numa and use {k,v}malloc_node()"
This reverts commit e4c52c98e04937ea87b0979a81354d0040d284f9.

Revert "RDS/IB: Remove unused variable in ib_remove_addr()"
This reverts commit 4a81802b5e5e0b059627d7173c917711cf35e668.

Revert "rds: rcu-ize rds_ib_get_device()"
This reverts commit 764f2dd92f5cd308d1c4372b33fea2b265c093f5.

Revert "rds: per-rm flush_wait waitq"
This reverts commit c83188dcd76b1f0c17c31b4bbd8de57c634b19f8.

Revert "rds: switch to rwlock on bind_lock"
This reverts commit 976673ee1b92d939168c8c1fbad3e16c45caa545.

Revert "RDS: Update comments in rds_send_xmit()"
This reverts commit ce47f52f42e69d48d1b63fa618fce9cd7ffa9417.

Revert "RDS: Use a generation counter to avoid rds_send_xmit loop"
This reverts commit 9e29db0e3645cafa980e68a9c717a761448389e1.

Revert "RDS: Get pong working again"
This reverts commit acfcd4d4ec4ed8cb504f96d4fabb7a94029b362b.

Revert "RDS: Do wait_event_interruptible instead of wait_event"
This reverts commit a40aa9233aa22d69212d02f92e5b607bd4d658f4.

Revert "RDS: Remove send_quota from send_xmit()"
This reverts commit fcc5450c6386526034edc437e4cb2c67a6fdd7e9.

Revert "RDS: Move atomic stats from general to ib-specific area"
This reverts commit 51e2cba8b5936c13b40f0fa11aa4e84683dbc751.

Revert "RDS: rds_message_unmapped() doesn't need to check if queue active"
This reverts commit ab1a6926f589c51e7a57ce7544d85272c4acc854.

Revert "RDS: Fix locking in send on m_rs_lock"
This reverts commit cf4b7389ee812817deeb11da1422004e01b50646.

Revert "RDS: Use NOWAIT in message_map_pages()"
This reverts commit f2ec76f288118fb18449402d75383212cbcb6762.

Revert "RDS: Bypass workqueue when queueing cong updates"
This reverts commit 2fa57129df61bf3fb7d90c5486fe15df94091f61.

Revert "RDS: Call rds_send_xmit() directly from sendmsg()"
This reverts commit a7d3a281483684f77e350b045af7f80a149fc4c7.

Revert "RDS: rds_send_xmit() locking/irq fixes"
This reverts commit 2ad8099b58f274dc23bc866ca259d7e5db87fa1a.

Revert "RDS: Change send lock from a mutex to a spinlock"
This reverts commit 049ee3f500954176a87f22e6ee3e98aecb1b8958.

Revert "RDS: Refill recv ring directly from tasklet"
This reverts commit f17a1a55fb672d7f64be7f2e940ef5669e5efa0a.

Revert "RDS: Stop supporting old cong map sending method"
This reverts commit 77dd550e5547846604ff6f90c4dc6bba4414e485.

Revert "RDS/IB: Do not wait for send ring to be empty on conn shutdown"
This reverts commit e32b4a70495aac6a612e13f4c21db09dd756ff2c.

Revert "RDS: Perform unmapping ops in stages"
This reverts commit ff3d7d36134ef7138803734fdbf91cc986ea7976.

Revert "RDS: Make sure cmsgs aren't used in improper ways"
This reverts commit aa0a4ef4ac3a3c5ffa35e32520bfbc0922ef3630.

Revert "RDS: Add flag for silent ops. Do atomic op before RDMA"
This reverts commit 2c3a5f9abb1dc5efdab8ba9a568b1661c65fd1e3.

Revert "RDS: Move some variables around for consistency"
This reverts commit 7e3bd65ebfd5d6cd76b8b979920c632d6e6b4b2a.

Revert "RDS: queue failure notifications for dropped atomic ops"
This reverts commit 940786eb0a0faf3f30898a1cc7c1540d54c1aff6.

Revert "RDS: Add a warning if trying to allocate 0 sgs"
This reverts commit ee4c7b47e46a9dea789aadb8279c8505f755b3ee.

Revert "RDS: Do not set op_active in r_m_copy_from_user()."
This reverts commit 372cd7dedfd1ea93a9ae8d9c282e910dc1b76773.

Revert "RDS: Rewrite rds_send_xmit"
This reverts commit 5b2366bd2835919e2e6a836e837eab4a9274bd46.

Revert "RDS: Rename data op members prefix from m_ to op_"
This reverts commit 6c7cc6e4694dc464ae884332f2a322973497e3cf.

Revert "RDS: Remove struct rds_rdma_op"
This reverts commit f8b3aaf2ba8ca9e27b47f8bfdff07c8b968f2c05.

Revert "RDS: purge atomic resources too in rds_message_purge()"
This reverts commit d0ab25a83c4a08cd98b73a37d3f4c069f7b4f50b.

Revert "RDS: Inline rdma_prepare into cmsg_rdma_args"
This reverts commit 4324879df06ba4db01a0b455af2d003f117e6aa3.

Revert "RDS: Implement silent atomics"
This reverts commit 241eef3e2f51fe4ad50abacd7f79c4e2d468197e.

Revert "RDS: Move loop-only function to loop.c"
This reverts commit d37c9359056f4f07b37e59810f0ece1031e280b2.

Revert "RDS/IB: Make all flow control code conditional on i_flowctl"
This reverts commit c8de3f1005e8359ea07083e37f3f993646e1adba.

Revert "RDS: Remove unsignaled_bytes sysctl"
This reverts commit 1d34f175712b59ad292ecbbaa8fc05402a1fd8ed.

Revert "RDS: rewrite rds_ib_xmit"
This reverts commit da5a06cef5724737af4315715632f0a07dd5e116.

Revert "RDS/IB: Remove ib_[header/data]_sge() functions"
This reverts commit 919ced4ce7d6ac62dd5be62d8993fe22a527d53a.

Revert "RDS/IB: Remove dead code"
This reverts commit 6f3d05db0da0b874afd2dd229bed715133532f8d.

Revert "RDS/IB: Disallow connections less than RDS 3.1"
This reverts commit f147dd9ecabf23fd63d2562ffe64252a0453ecde.

Revert "RDS/IB: eliminate duplicate code"
This reverts commit 9c030391e8741695ff6114703e4edccccb634479.

Revert "RDS: inc_purge() transport function unused - remove it"
This reverts commit 809fa148a29467954280fe8b7f97c92403f6293c.

Revert "RDS: Whitespace"
This reverts commit 6200ed7799d9225f363f157ab61f1566cfd80e19.

Revert "RDS: Do not mask address when pinning pages"
This reverts commit d22faec22c2ab2364fd8fc3c8159b0b5b28b0fd1.

Revert "RDS: Base init_depth and responder_resources on hw values"
This reverts commit 40589e74f7ba855f3a887c9d4abe9d100c5b039c.

Revert "RDS: Implement atomic operations"
This reverts commit 15133f6e67d8d646d0744336b4daa3135452cb0d.

Revert "RDS: Clear up some confusing code in send_remove_from_sock"
This reverts commit a63273d4992603979ddb181b6a8f07082839b39f.

Revert "RDS: make sure all sgs alloced are initialized"
This reverts commit f4dd96f7b27743e568cec519eff0f951c56833c6.

Revert "RDS: make m_rdma_op a member of rds_message"
This reverts commit ff87e97a9d70c9ae133d3d3d7792b26ab85f4297.

Revert "RDS: fold rdma.h into rds.h"
This reverts commit 21f79afa5fda2820671a8f64c3d0e43bb118053b.

Revert "RDS: Explicitly allocate rm in sendmsg()"
This reverts commit fc445084f185cdd877bec323bfe724a361e2292a.

Revert "RDS: cleanup/fix rds_rdma_unuse"
This reverts commit 3ef13f3c22aaea28aff383cb0883481d24885456.

Revert "RDS: break out rdma and data ops into nested structs in rds_message"
This reverts commit e779137aa76d38d5c33a98ed887092ae4e4f016f.

Revert "RDS: cleanup: remove "== NULL"s and "!= NULL"s in ptr comparisons"
This reverts commit 8690bfa17aea4c42da1bcf90a7af93d161eca624.

Revert "RDS: move rds_shutdown_worker impl. to rds_conn_shutdown"
This reverts commit 2dc393573430f853e56e25bf4b41c34ba2aa8fd6.

Revert "RDS: Fix locking in send on m_rs_lock"
This reverts commit 9de0864cf55927a7383b5ba6e48834ff3ef053de.

Revert "RDS: Rewrite rds_send_drop_to() for clarity"
This reverts commit 7c82eaf00ec7d460932be9314b29997006b799b6.

Revert "RDS: Fix corrupted rds_mrs"
This reverts commit 35b52c70534cb7193b218ec12efe6bc595312097.

Revert "RDS: Fix BUG_ONs to not fire when in a tasklet"
This reverts commit 9e2effba2c16fc3bd47da605116485afe01e0be0.

Revert "net: rds: remove duplication type definitions"
This reverts commit fcb12fd2236f49aa8fdc1568ed4ebdfe4fddc6b5.

Revert "rds: fix a leak of kernel memory"
This reverts commit f037590fff3005ce8a1513858d7d44f50053cc8f.

Revert "net: use __packed annotation" for include/linux/rds.h
This reverts commit bc10502dba37d3b210efd9f3867212298f13b78e for include/linux/rds.h.

Revert "net/rds: Add missing mutex_unlock"
This reverts commit 5daf47bb4e708fde32c1856a0d049e3c3d03c36c.

Revert "net: Remove unnecessary semicolons after switch statements" for net/rds
This reverts commit ccbd6a5a4f76e821ed36f69fdaf59817c3a7f18e for net/rds.

Revert "rdma: potential ERR_PTR dereference"
This reverts commit 24acc6895616b373475e92e49925efc3ef591563.

Revert "net: sk_sleep() helper" for net/rds
This reverts commit aa395145165cb06a0d0885221bbe0ce4a564391d for net/rds.

Revert "include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h" for net/rds
This reverts commit 5a0e3ad6af8660be21ca98a971cd00f331318c05 for net/rds.

Revert "rds: cleanup: remove unneeded variable"
This reverts commit 18062ca94714a66e75da8a22e010d0e8e61ab4cd.

Revert "RDS: Enable per-cpu workqueue threads"
This reverts commit 768bbedf9ca4cc4784eae2003f37abe0818fe0b0.

Revert "RDS: Do not call set_page_dirty() with irqs off"
This reverts commit 561c7df63e259203515509a7ad075382a42bff0c.

Revert "RDS: Properly unmap when getting a remote access error"
This reverts commit 450d06c0208ad195ccd74a7edd11321e316791ad.

Revert "RDS: only put sockets that have seen congestion on the poll_waitq"
This reverts commit b98ba52f96e7cdb4dbe2b06bced83d95d94c9d02.

Revert "RDS: Fix locking in rds_send_drop_to()"
This reverts commit 550a8002e4340eaf3bc333e33b59427e9c20272d.

Revert "RDS: Turn down alarming reconnect messages"
This reverts commit 97069788d6784ac92d050557a02f6e7bf4d1f53d.

Revert "RDS: Workaround for in-use MRs on close causing crash"
This reverts commit 571c02fa81e43ebb4b793f626d6c7bf0fa18902b.

Revert "RDS: Fix send locking issue"
This reverts commit 048c15e641289d902f7ef9f1241068d8045e210c.

Revert "RDS: Fix congestion issues for loopback"
This reverts commit 2e7b3b994529d4760231a45a6b88950187bda877.

Revert "RDS/TCP: Wait to wake thread when write space available"
This reverts commit 8e82376e5f72bb576504c8c6117685e56c1b97db.

Revert "RDS: update copy_to_user state in tcp transport"
This reverts commit b075cfdb666d6fa90c55c8619186398a3c4fd865.

Revert "RDS: sendmsg() should check sndtimeo, not rcvtimeo"
This reverts commit 1123fd734df6ad82373a5a27f0f2ed3115555b9d.

Revert "RDS: Do not BUG() on error returned from ib_post_send"
This reverts commit 735f61e62611161588123930823af6e6a9fd5c2c.

Revert "net/rds: remove uses of NIPQUAD, use %pI4"
This reverts commit 6884b348ed759184032306c9435a727741a72298.

Revert "net: Move && and || to end of previous line" for net/rds
This reverts commit f64f9e719261a87818dd192a3a2352e5b20fbd0f for net/rds.

Revert "RDMA/cm: fix loopback address support" for net/rds
This reverts commit 6f8372b69c3198e06cecb1df2cb9682d0c55e657 for net/rds.

Revert "sysctl: Drop & in front of every proc_handler." for net/rds
This reverts commit 6d4561110a3e9fa742aeec6717248a491dfb1878 for net/rds.

Revert "sysctl net: Remove unused binary sysctl code" for net/rds
This reverts commit f8572d8f2a2ba75408b97dc24ef47c83671795d7 for net/rds.

Revert "net: pass kern to net_proto_family create function" for net/rds
This reverts commit 3f378b684453f2a028eda463ce383370545d9cc9 for net/rds.

Revert "RDS/IB+IW: Move recv processing to a tasklet"
This reverts commit d521b63b27e3a397e0ef7ca86b6e813861083c83.

Revert "RDS: Do not send congestion updates to loopback connections"
This reverts commit 0514f8a9c0cbd26afa70dc56406cc0ee1e134dcf.

Revert "RDS: Fix panic on unload"
This reverts commit 433d308dd85e506bb6529177cc0f997627d87ed6.

Revert "RDS: Fix potential race around rds_i[bw]_allocation"
This reverts commit 86357b19bcabd9355937f3fb84f90ba9fe76a5d3.

Revert "RDS: Add GET_MR_FOR_DEST sockopt"
This reverts commit 244546f0d3101c5441f5b14cfe8a79d62679eaea.

Revert "inet: rename some inet_sock fields" for net/rds
This reverts commit c720c7e8383aff1cb219bddf474ed89d850336e3 for net/rds.

Revert "net: mark net_proto_ops as const" for net/rds
This reverts commit ec1b4cf74c81bfd0fbe5bf62bafc86c45917e72f for net/rds.

Revert "net: Make setsockopt() optlen be unsigned." for net/rds
This reverts commit b7058842c940ad2c08dd829b21e5c92ebe3b8758 for net/rds.

Revert "net: constify remaining proto_ops" for net/rds
This reverts commit 5708e868dc512f055f0ea4a14d01f8252c3ca8a1 for net/rds.

Revert "RDS: Add a debug message suggesting to load transport modules"
This reverts commit f2c449320d547bd5c281649eb1d99afb20765144.

Revert "RDS: Track transports via an array, not a list"
This reverts commit 335776bd696a6bf95134baf8ad95847371e4d5f6.

Revert "RDS: Modularize RDMA and TCP transports"
This reverts commit 40d866095df3bb70ded1813f4852cab445ef678b.

Revert "RDS: Export symbols from core RDS"
This reverts commit 616b757ae18fb8ec2dfe7ff9d3f589f82cb0eb9d.

Revert "RDS: Add TCP transport to RDS"
This reverts commit 70041088e3b976627ba9a183b812f39ef8a9ba0e.

Revert "net/rds: Use AF_INET for sin_family field"
This reverts commit 3d7ddd540b4c2d24c6a3e7a52c083a0c31e6151c.

Revert "net: mark read-only arrays as const" for net/rds
This reverts commit 36cbd3dcc10384f813ec0814255f576c84f2bcd4 for net/rds.

Revert "RDS: Refactor end of __conn_create for readability"
This reverts commit cb24405e67e56cbef51b5e4d0bb0a0fde167261f.

Revert "RDS/IW: Remove dead code"
This reverts commit ed9e352a350ec85eb354046e0db6a86019620f53.

Revert "RDS/IW: Remove page_shift variable from iwarp transport"
This reverts commit 404bb72a56e553febe1055f98347a7a3e3145759.

Revert "RDS/IB: Always use PAGE_SIZE for FMR page size"
This reverts commit a870d62726721785c34fa73d852bd35e5d1b295b.

Revert "RDS: Fix completion notifications on blocking sockets"
This reverts commit edacaeae52ade6cbb3a0704db32a9fb4a219dee3.

Revert "RDS/IB: Drop connection when a fatal QP event is received"
This reverts commit fdf6e6b4afd8a56fa58f70a3950bd7ea7fbaef5f.

Revert "RDS/IB: Disable flow control in sysctl and explain why"
This reverts commit 68cb01c1ba312add7c7cc7da1bbe98b3071904d1.

Revert "RDS/IB: Move tx/rx ring init and refill to later"
This reverts commit e11d912a7dd4dfe388f38ba3080a6d067a57b23d.

Revert "RDS: Don't set c_version in __rds_conn_create()"
This reverts commit 9099707ded4b3aeda7b1a6c1c87076bd18578d24.

Revert "RDS/IB: Rename byte_len to data_len to enhance readability"
This reverts commit 597ddd50e1c07ac55ac7742442690efcf16a37f5.

Revert "RDS/RDMA: Fix cut-n-paste errors in printks in rdma_transport.c"
This reverts commit 92c330b9e93ce70a8c45a6b8b0a551321d783feb.

Revert "RDS/IB: Fix printk to indicate remote IP, not local"
This reverts commit 8dacd57e7ebc307d4d7c27c5d1caada4c4e63ebd.

Revert "RDS/IB: Handle connections using RDS 3.0 wire protocol"
This reverts commit 02a6a2592e41d27644d647f3bce23598649961bc.

Revert "RDS/IB: Improve RDS protocol version checking"
This reverts commit 9ddbcfa098bae757d3760dd1dbf2847a0bd5a525.

Revert "RDS: Set retry_count to 2 and make modifiable via modparam"
This reverts commit 3ba23ade464cca7c4a7ba5628c613339d3f2e161.

Revert "percpu: use DEFINE_PER_CPU_SHARED_ALIGNED()" for net/rds
This reverts commit b9bf3121af348d9255f1c917830fe8c2df52efcb for net/rds.

Revert "FRV: Fix the section attribute on UP DECLARE_PER_CPU()" for net/rds
This reverts commit 9b8de7479d0dbab1ed98b5b015d44232c9d3d08e for net/rds.

Revert "ERR_PTR() dereference in net/rds/ib.c"
This reverts commit 94713bab649736177a1c33a39b7bb33cbd5af3a5.

Revert "ERR_PTR() dereference in net/rds/iw.c"
This reverts commit 5d57eeb52ae71a03c8e083a9b0a818a9b63ca440.

Revert "rds: use kmem_cache_zalloc instead of kmem_cache_alloc/memset"
This reverts commit 05a178ecdc7396b78dfbb5d8bda65108b37b8672.

Revert "RDS: remove unused #include <version.h>"
This reverts commit 9c56a84478b708e5d8d34d28cc3a8e71842d5b05.

Revert "RDS: use get_user_pages_fast()"
This reverts commit 830eb7d56c18ff4c29acd8b0bb48db404660321f.

Revert "RDS: Establish connection before parsing CMSGs"
This reverts commit 49f696914100780f6bf8e652d3468de0015d6172.

Revert "RDS: Fix ordering in a conditional"
This reverts commit 7acd4a794c1530af063e51f3f7171e75556458f3.

Revert "RDS/IW+IB: Allow max credit advertise window."
This reverts commit 7b70d0336da777c00395cc7a503497c2cdabd1a8.

Revert "RDS/IW+IB: Set the RDS_LL_SEND_FULL bit when we're throttled."
This reverts commit d39e0602bb987133321d358d9b837d67c27b223d.

Revert "RDS: Correct some iw references in rdma_transport.c"
This reverts commit 11bc9421da3040c71fc96da1a31e95217e8cf2af.

Revert "RDS/IW+IB: Set recv ring low water mark to 1/2 full."
This reverts commit 5cd2fe6d54c91aa76893b3034f5f3473063c0202.

-------

Signed-off-by: Ajaykumar Hotchandani <ajaykumar.hotchandani@oracle.com>
(Added revert details)
Signed-off-by: Mukesh Kacker <mukesh.kacker@oracle.com>
47 files changed:
include/linux/rds.h [moved from include/uapi/linux/rds.h with 80% similarity]
net/rds/Kconfig
net/rds/Makefile
net/rds/af_rds.c
net/rds/bind.c
net/rds/cong.c
net/rds/connection.c
net/rds/ib.c
net/rds/ib.h
net/rds/ib_cm.c
net/rds/ib_rdma.c
net/rds/ib_recv.c
net/rds/ib_ring.c
net/rds/ib_send.c
net/rds/ib_stats.c
net/rds/ib_sysctl.c
net/rds/info.c
net/rds/iw.c
net/rds/iw.h
net/rds/iw_cm.c
net/rds/iw_rdma.c
net/rds/iw_recv.c
net/rds/iw_ring.c
net/rds/iw_send.c
net/rds/iw_stats.c
net/rds/iw_sysctl.c
net/rds/loop.c
net/rds/message.c
net/rds/page.c
net/rds/rdma.c
net/rds/rdma.h [new file with mode: 0644]
net/rds/rdma_transport.c
net/rds/rdma_transport.h
net/rds/rds.h
net/rds/recv.c
net/rds/send.c
net/rds/stats.c
net/rds/sysctl.c
net/rds/tcp.c [deleted file]
net/rds/tcp.h [deleted file]
net/rds/tcp_connect.c [deleted file]
net/rds/tcp_listen.c [deleted file]
net/rds/tcp_recv.c [deleted file]
net/rds/tcp_send.c [deleted file]
net/rds/tcp_stats.c [deleted file]
net/rds/threads.c
net/rds/transport.c

similarity index 80%
rename from include/uapi/linux/rds.h
rename to include/linux/rds.h
index 91950950aa598060a8e0e370f82654cd9a75e7d6..d91dc91f544302a05765126308a345c1dac1c686 100644 (file)
 
 #include <linux/types.h>
 
+/* These sparse annotated types shouldn't be in any user
+ * visible header file. We should clean this up rather
+ * than kludging around them. */
+#ifndef __KERNEL__
+#define __be16 u_int16_t
+#define __be32 u_int32_t
+#define __be64 u_int64_t
+#endif
+
 #define RDS_IB_ABI_VERSION             0x301
 
 /*
@@ -47,7 +56,6 @@
 /* deprecated: RDS_BARRIER 4 */
 #define RDS_RECVERR                    5
 #define RDS_CONG_MONITOR               6
-#define RDS_GET_MR_FOR_DEST            7
 
 /*
  * Control message types for SOL_RDS.
 #define RDS_CMSG_RDMA_MAP              3
 #define RDS_CMSG_RDMA_STATUS           4
 #define RDS_CMSG_CONG_UPDATE           5
-#define RDS_CMSG_ATOMIC_FADD           6
-#define RDS_CMSG_ATOMIC_CSWP           7
-#define RDS_CMSG_MASKED_ATOMIC_FADD    8
-#define RDS_CMSG_MASKED_ATOMIC_CSWP    9
 
 #define RDS_INFO_FIRST                 10000
 #define RDS_INFO_COUNTERS              10000
@@ -93,8 +97,8 @@
 #define RDS_INFO_LAST                  10010
 
 struct rds_info_counter {
-       uint8_t name[32];
-       uint64_t        value;
+       u_int8_t        name[32];
+       u_int64_t       value;
 } __attribute__((packed));
 
 #define RDS_INFO_CONNECTION_FLAG_SENDING       0x01
@@ -104,47 +108,43 @@ struct rds_info_counter {
 #define TRANSNAMSIZ    16
 
 struct rds_info_connection {
-       uint64_t        next_tx_seq;
-       uint64_t        next_rx_seq;
+       u_int64_t       next_tx_seq;
+       u_int64_t       next_rx_seq;
        __be32          laddr;
        __be32          faddr;
-       uint8_t transport[TRANSNAMSIZ];         /* null term ascii */
-       uint8_t flags;
+       u_int8_t        transport[TRANSNAMSIZ];         /* null term ascii */
+       u_int8_t        flags;
+} __attribute__((packed));
+
+struct rds_info_flow {
+       __be32          laddr;
+       __be32          faddr;
+       u_int32_t       bytes;
+       __be16          lport;
+       __be16          fport;
 } __attribute__((packed));
 
 #define RDS_INFO_MESSAGE_FLAG_ACK               0x01
 #define RDS_INFO_MESSAGE_FLAG_FAST_ACK          0x02
 
 struct rds_info_message {
-       uint64_t        seq;
-       uint32_t        len;
+       u_int64_t       seq;
+       u_int32_t       len;
        __be32          laddr;
        __be32          faddr;
        __be16          lport;
        __be16          fport;
-       uint8_t flags;
+       u_int8_t        flags;
 } __attribute__((packed));
 
 struct rds_info_socket {
-       uint32_t        sndbuf;
+       u_int32_t       sndbuf;
        __be32          bound_addr;
        __be32          connected_addr;
        __be16          bound_port;
        __be16          connected_port;
-       uint32_t        rcvbuf;
-       uint64_t        inum;
-} __attribute__((packed));
-
-struct rds_info_tcp_socket {
-       __be32          local_addr;
-       __be16          local_port;
-       __be32          peer_addr;
-       __be16          peer_port;
-       uint64_t       hdr_rem;
-       uint64_t       data_rem;
-       uint32_t       last_sent_nxt;
-       uint32_t       last_expected_una;
-       uint32_t       last_seen_una;
+       u_int32_t       rcvbuf;
+       u_int64_t       inum;
 } __attribute__((packed));
 
 #define RDS_IB_GID_LEN 16
@@ -199,69 +199,35 @@ struct rds_info_rdma_connection {
  * (so that the application does not have to worry about
  * alignment).
  */
-typedef uint64_t       rds_rdma_cookie_t;
+typedef u_int64_t      rds_rdma_cookie_t;
 
 struct rds_iovec {
-       uint64_t        addr;
-       uint64_t        bytes;
+       u_int64_t       addr;
+       u_int64_t       bytes;
 };
 
 struct rds_get_mr_args {
        struct rds_iovec vec;
-       uint64_t        cookie_addr;
+       u_int64_t       cookie_addr;
        uint64_t        flags;
 };
 
-struct rds_get_mr_for_dest_args {
-       struct sockaddr_storage dest_addr;
-       struct rds_iovec        vec;
-       uint64_t                cookie_addr;
-       uint64_t                flags;
-};
-
 struct rds_free_mr_args {
        rds_rdma_cookie_t cookie;
-       uint64_t        flags;
+       u_int64_t       flags;
 };
 
 struct rds_rdma_args {
        rds_rdma_cookie_t cookie;
        struct rds_iovec remote_vec;
-       uint64_t        local_vec_addr;
-       uint64_t        nr_local;
-       uint64_t        flags;
-       uint64_t        user_token;
-};
-
-struct rds_atomic_args {
-       rds_rdma_cookie_t cookie;
-       uint64_t        local_addr;
-       uint64_t        remote_addr;
-       union {
-               struct {
-                       uint64_t        compare;
-                       uint64_t        swap;
-               } cswp;
-               struct {
-                       uint64_t        add;
-               } fadd;
-               struct {
-                       uint64_t        compare;
-                       uint64_t        swap;
-                       uint64_t        compare_mask;
-                       uint64_t        swap_mask;
-               } m_cswp;
-               struct {
-                       uint64_t        add;
-                       uint64_t        nocarry_mask;
-               } m_fadd;
-       };
-       uint64_t        flags;
-       uint64_t        user_token;
+       u_int64_t       local_vec_addr;
+       u_int64_t       nr_local;
+       u_int64_t       flags;
+       u_int64_t       user_token;
 };
 
 struct rds_rdma_notify {
-       uint64_t        user_token;
+       u_int64_t       user_token;
        int32_t         status;
 };
 
@@ -280,6 +246,5 @@ struct rds_rdma_notify {
 #define RDS_RDMA_USE_ONCE      0x0008  /* free MR after use */
 #define RDS_RDMA_DONTWAIT      0x0010  /* Don't wait in SET_BARRIER */
 #define RDS_RDMA_NOTIFY_ME     0x0020  /* Notify when operation completes */
-#define RDS_RDMA_SILENT                0x0040  /* Do not interrupt remote */
 
 #endif /* IB_RDS_H */
index f2c670ba7b9b2b26592e1b07bbedf0c5b8b3b842..796773b5df9b819c1bf33f90654556187264fe9d 100644 (file)
@@ -1,28 +1,14 @@
 
 config RDS
-       tristate "The RDS Protocol"
-       depends on INET
+       tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)"
+       depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL
+       depends on INFINIBAND && INFINIBAND_ADDR_TRANS
        ---help---
-         The RDS (Reliable Datagram Sockets) protocol provides reliable,
-         sequenced delivery of datagrams over Infiniband, iWARP,
-         or TCP.
-
-config RDS_RDMA
-       tristate "RDS over Infiniband and iWARP"
-       depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
-       ---help---
-         Allow RDS to use Infiniband and iWARP as a transport.
-         This transport supports RDMA operations.
-
-config RDS_TCP
-       tristate "RDS over TCP"
-       depends on RDS
-       ---help---
-         Allow RDS to use TCP as a transport.
-         This transport does not support RDMA operations.
+         RDS provides reliable, sequenced delivery of datagrams
+         over Infiniband.
 
 config RDS_DEBUG
-        bool "RDS debugging messages"
+        bool "Debugging messages"
        depends on RDS
         default n
 
index 56d3f6023ced41adb81510ec0dc8525c9230dea9..51f27585fa083f840d228f59c7a2b1968c6ab095 100644 (file)
@@ -1,19 +1,14 @@
 obj-$(CONFIG_RDS) += rds.o
 rds-y :=       af_rds.o bind.o cong.o connection.o info.o message.o   \
                        recv.o send.o stats.o sysctl.o threads.o transport.o \
-                       loop.o page.o rdma.o
-
-obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
-rds_rdma-y :=  rdma_transport.o \
+                       loop.o page.o rdma.o \
+                       rdma_transport.o \
                        ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
                        ib_sysctl.o ib_rdma.o \
                        iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
                        iw_sysctl.o iw_rdma.o
 
-
-obj-$(CONFIG_RDS_TCP) += rds_tcp.o
-rds_tcp-y :=           tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
-                       tcp_send.o tcp_stats.o
-
-ccflags-$(CONFIG_RDS_DEBUG)    :=      -DDEBUG
+ifeq ($(CONFIG_RDS_DEBUG), y)
+EXTRA_CFLAGS += -DDEBUG
+endif
 
index 10443377fb9d8f5b5cb928647fa58c03001a072b..20cf16fc572f20594fa05d25ee4554a4f30d3d0a 100644 (file)
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
-#include <linux/gfp.h>
 #include <linux/in.h>
 #include <linux/poll.h>
+#include <linux/version.h>
 #include <net/sock.h>
 
 #include "rds.h"
-
-char *rds_str_array(char **array, size_t elements, size_t index)
-{
-       if ((index < elements) && array[index])
-               return array[index];
-       else
-               return "unknown";
-}
-EXPORT_SYMBOL(rds_str_array);
+#include "rdma.h"
+#include "rdma_transport.h"
 
 /* this is just used for stats gathering :/ */
 static DEFINE_SPINLOCK(rds_sock_lock);
@@ -68,8 +61,9 @@ static int rds_release(struct socket *sock)
 {
        struct sock *sk = sock->sk;
        struct rds_sock *rs;
+       unsigned long flags;
 
-       if (!sk)
+       if (sk == NULL)
                goto out;
 
        rs = rds_sk_to_rs(sk);
@@ -80,25 +74,15 @@ static int rds_release(struct socket *sock)
         * with the socket. */
        rds_clear_recv_queue(rs);
        rds_cong_remove_socket(rs);
-
-       /*
-        * the binding lookup hash uses rcu, we need to
-        * make sure we synchronize_rcu before we free our
-        * entry
-        */
        rds_remove_bound(rs);
-       synchronize_rcu();
-
        rds_send_drop_to(rs, NULL);
        rds_rdma_drop_keys(rs);
        rds_notify_queue_get(rs, NULL);
 
-       spin_lock_bh(&rds_sock_lock);
+       spin_lock_irqsave(&rds_sock_lock, flags);
        list_del_init(&rs->rs_item);
        rds_sock_count--;
-       spin_unlock_bh(&rds_sock_lock);
-
-       rds_trans_put(rs->rs_transport);
+       spin_unlock_irqrestore(&rds_sock_lock, flags);
 
        sock->sk = NULL;
        sock_put(sk);
@@ -175,10 +159,9 @@ static unsigned int rds_poll(struct file *file, struct socket *sock,
        unsigned int mask = 0;
        unsigned long flags;
 
-       poll_wait(file, sk_sleep(sk), wait);
+       poll_wait(file, sk->sk_sleep, wait);
 
-       if (rs->rs_seen_congestion)
-               poll_wait(file, &rds_poll_waitq, wait);
+       poll_wait(file, &rds_poll_waitq, wait);
 
        read_lock_irqsave(&rs->rs_recv_lock, flags);
        if (!rs->rs_cong_monitor) {
@@ -193,17 +176,13 @@ static unsigned int rds_poll(struct file *file, struct socket *sock,
                        mask |= (POLLIN | POLLRDNORM);
                spin_unlock(&rs->rs_lock);
        }
-       if (!list_empty(&rs->rs_recv_queue) ||
-           !list_empty(&rs->rs_notify_queue))
+       if (!list_empty(&rs->rs_recv_queue)
+        || !list_empty(&rs->rs_notify_queue))
                mask |= (POLLIN | POLLRDNORM);
        if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
                mask |= (POLLOUT | POLLWRNORM);
        read_unlock_irqrestore(&rs->rs_recv_lock, flags);
 
-       /* clear state any time we wake a seen-congested socket */
-       if (mask)
-               rs->rs_seen_congestion = 0;
-
        return mask;
 }
 
@@ -271,7 +250,7 @@ static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
 }
 
 static int rds_setsockopt(struct socket *sock, int level, int optname,
-                         char __user *optval, unsigned int optlen)
+                         char __user *optval, int optlen)
 {
        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
        int ret;
@@ -288,9 +267,6 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
        case RDS_GET_MR:
                ret = rds_get_mr(rs, optval, optlen);
                break;
-       case RDS_GET_MR_FOR_DEST:
-               ret = rds_get_mr_for_dest(rs, optval, optlen);
-               break;
        case RDS_FREE_MR:
                ret = rds_free_mr(rs, optval, optlen);
                break;
@@ -331,8 +307,8 @@ static int rds_getsockopt(struct socket *sock, int level, int optname,
                if (len < sizeof(int))
                        ret = -EINVAL;
                else
-               if (put_user(rs->rs_recverr, (int __user *) optval) ||
-                   put_user(sizeof(int), optlen))
+               if (put_user(rs->rs_recverr, (int __user *) optval)
+                || put_user(sizeof(int), optlen))
                        ret = -EFAULT;
                else
                        ret = 0;
@@ -385,7 +361,7 @@ static struct proto rds_proto = {
        .obj_size = sizeof(struct rds_sock),
 };
 
-static const struct proto_ops rds_proto_ops = {
+static struct proto_ops rds_proto_ops = {
        .family =       AF_RDS,
        .owner =        THIS_MODULE,
        .release =      rds_release,
@@ -408,6 +384,7 @@ static const struct proto_ops rds_proto_ops = {
 
 static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
 {
+       unsigned long flags;
        struct rds_sock *rs;
 
        sock_init_data(sock, sk);
@@ -424,16 +401,15 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
        spin_lock_init(&rs->rs_rdma_lock);
        rs->rs_rdma_keys = RB_ROOT;
 
-       spin_lock_bh(&rds_sock_lock);
+       spin_lock_irqsave(&rds_sock_lock, flags);
        list_add_tail(&rs->rs_item, &rds_sock_list);
        rds_sock_count++;
-       spin_unlock_bh(&rds_sock_lock);
+       spin_unlock_irqrestore(&rds_sock_lock, flags);
 
        return 0;
 }
 
-static int rds_create(struct net *net, struct socket *sock, int protocol,
-                     int kern)
+static int rds_create(struct net *net, struct socket *sock, int protocol)
 {
        struct sock *sk;
 
@@ -457,7 +433,7 @@ void rds_sock_put(struct rds_sock *rs)
        sock_put(rds_rs_to_sk(rs));
 }
 
-static const struct net_proto_family rds_family_ops = {
+static struct net_proto_family rds_family_ops = {
        .family =       AF_RDS,
        .create =       rds_create,
        .owner  =       THIS_MODULE,
@@ -468,14 +444,17 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
                              struct rds_info_lengths *lens)
 {
        struct rds_sock *rs;
+       struct sock *sk;
        struct rds_incoming *inc;
+       unsigned long flags;
        unsigned int total = 0;
 
        len /= sizeof(struct rds_info_message);
 
-       spin_lock_bh(&rds_sock_lock);
+       spin_lock_irqsave(&rds_sock_lock, flags);
 
        list_for_each_entry(rs, &rds_sock_list, rs_item) {
+               sk = rds_rs_to_sk(rs);
                read_lock(&rs->rs_recv_lock);
 
                /* XXX too lazy to maintain counts.. */
@@ -489,7 +468,7 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
                read_unlock(&rs->rs_recv_lock);
        }
 
-       spin_unlock_bh(&rds_sock_lock);
+       spin_unlock_irqrestore(&rds_sock_lock, flags);
 
        lens->nr = total;
        lens->each = sizeof(struct rds_info_message);
@@ -501,10 +480,11 @@ static void rds_sock_info(struct socket *sock, unsigned int len,
 {
        struct rds_info_socket sinfo;
        struct rds_sock *rs;
+       unsigned long flags;
 
        len /= sizeof(struct rds_info_socket);
 
-       spin_lock_bh(&rds_sock_lock);
+       spin_lock_irqsave(&rds_sock_lock, flags);
 
        if (len < rds_sock_count)
                goto out;
@@ -525,11 +505,12 @@ out:
        lens->nr = rds_sock_count;
        lens->each = sizeof(struct rds_info_socket);
 
-       spin_unlock_bh(&rds_sock_lock);
+       spin_unlock_irqrestore(&rds_sock_lock, flags);
 }
 
-static void rds_exit(void)
+static void __exit rds_exit(void)
 {
+       rds_rdma_exit();
        sock_unregister(rds_family_ops.family);
        proto_unregister(&rds_proto);
        rds_conn_exit();
@@ -543,7 +524,7 @@ static void rds_exit(void)
 }
 module_exit(rds_exit);
 
-static int rds_init(void)
+static int __init rds_init(void)
 {
        int ret;
 
@@ -569,8 +550,14 @@ static int rds_init(void)
        rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
        rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
 
+       /* ib/iwarp transports currently compiled-in */
+       ret = rds_rdma_init();
+       if (ret)
+               goto out_sock;
        goto out;
 
+out_sock:
+       sock_unregister(rds_family_ops.family);
 out_proto:
        proto_unregister(&rds_proto);
 out_stats:
index a2e6562da751f79bd9524c7e434f1d1d02d3b4fe..c17cc39160cefd86d44a8787a83824e5f06ce660 100644 (file)
 #include <net/sock.h>
 #include <linux/in.h>
 #include <linux/if_arp.h>
-#include <linux/jhash.h>
-#include <linux/ratelimit.h>
 #include "rds.h"
 
-#define BIND_HASH_SIZE 1024
-static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
+/*
+ * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
+ * particularly zippy.
+ *
+ * This is now called for every incoming frame so we arguably care much more
+ * about it than we used to.
+ */
 static DEFINE_SPINLOCK(rds_bind_lock);
+static struct rb_root rds_bind_tree = RB_ROOT;
 
-static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
-{
-       return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
-                                 (BIND_HASH_SIZE - 1));
-}
-
-static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
-                                       struct rds_sock *insert)
+static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
+                                          struct rds_sock *insert)
 {
+       struct rb_node **p = &rds_bind_tree.rb_node;
+       struct rb_node *parent = NULL;
        struct rds_sock *rs;
-       struct hlist_head *head = hash_to_bucket(addr, port);
        u64 cmp;
        u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
 
-       rcu_read_lock();
-       hlist_for_each_entry_rcu(rs, head, rs_bound_node) {
+       while (*p) {
+               parent = *p;
+               rs = rb_entry(parent, struct rds_sock, rs_bound_node);
+
                cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
                      be16_to_cpu(rs->rs_bound_port);
 
-               if (cmp == needle) {
-                       rcu_read_unlock();
+               if (needle < cmp)
+                       p = &(*p)->rb_left;
+               else if (needle > cmp)
+                       p = &(*p)->rb_right;
+               else
                        return rs;
-               }
        }
-       rcu_read_unlock();
 
        if (insert) {
-               /*
-                * make sure our addr and port are set before
-                * we are added to the list, other people
-                * in rcu will find us as soon as the
-                * hlist_add_head_rcu is done
-                */
-               insert->rs_bound_addr = addr;
-               insert->rs_bound_port = port;
-               rds_sock_addref(insert);
-
-               hlist_add_head_rcu(&insert->rs_bound_node, head);
+               rb_link_node(&insert->rs_bound_node, parent, p);
+               rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
        }
        return NULL;
 }
@@ -93,13 +86,15 @@ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
 struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
 {
        struct rds_sock *rs;
+       unsigned long flags;
 
-       rs = rds_bind_lookup(addr, port, NULL);
-
+       spin_lock_irqsave(&rds_bind_lock, flags);
+       rs = rds_bind_tree_walk(addr, port, NULL);
        if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
                rds_sock_addref(rs);
        else
                rs = NULL;
+       spin_unlock_irqrestore(&rds_bind_lock, flags);
 
        rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
                ntohs(port));
@@ -117,7 +112,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
                rover = be16_to_cpu(*port);
                last = rover;
        } else {
-               rover = max_t(u16, prandom_u32(), 2);
+               rover = max_t(u16, net_random(), 2);
                last = rover - 1;
        }
 
@@ -126,15 +121,22 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
        do {
                if (rover == 0)
                        rover++;
-               if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
-                       *port = rs->rs_bound_port;
+               if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) {
+                       *port = cpu_to_be16(rover);
                        ret = 0;
-                       rdsdebug("rs %p binding to %pI4:%d\n",
-                         rs, &addr, (int)ntohs(*port));
                        break;
                }
        } while (rover++ != last);
 
+       if (ret == 0)  {
+               rs->rs_bound_addr = addr;
+               rs->rs_bound_port = *port;
+               rds_sock_addref(rs);
+
+               rdsdebug("rs %p binding to %pI4:%d\n",
+                 rs, &addr, (int)ntohs(*port));
+       }
+
        spin_unlock_irqrestore(&rds_bind_lock, flags);
 
        return ret;
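
The rover loop restored here is the classic ephemeral-port search: start at a random point no lower than 2, walk the full 16-bit space exactly once, skip port 0 on wrap-around, and stop at the first free slot. A self-contained sketch of the same control flow, with taken() standing in for the tree walk:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static bool taken(uint16_t port)
{
        return port != 4000;            /* pretend only port 4000 is free */
}

int main(void)
{
        uint16_t rover = (uint16_t)rand();
        if (rover < 2)
                rover = 2;              /* never hand out ports 0 or 1 */
        uint16_t last = rover - 1;      /* one full lap ends here (wraps) */

        do {
                if (rover == 0)
                        rover++;        /* skip port 0 after wrap-around */
                if (!taken(rover)) {
                        printf("bound to port %u\n", rover);
                        return 0;
                }
        } while (rover++ != last);

        printf("no free port\n");
        return 1;
}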
@@ -151,7 +153,7 @@ void rds_remove_bound(struct rds_sock *rs)
                  rs, &rs->rs_bound_addr,
                  ntohs(rs->rs_bound_port));
 
-               hlist_del_init_rcu(&rs->rs_bound_node);
+               rb_erase(&rs->rs_bound_node, &rds_bind_tree);
                rds_sock_put(rs);
                rs->rs_bound_addr = 0;
        }
@@ -182,11 +184,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
                goto out;
 
        trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
-       if (!trans) {
+       if (trans == NULL) {
                ret = -EADDRNOTAVAIL;
                rds_remove_bound(rs);
-               printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, "
-                               "load rds_tcp or rds_rdma?\n");
                goto out;
        }
 
@@ -195,9 +195,5 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 out:
        release_sock(sk);
-
-       /* we might have called rds_remove_bound on error */
-       if (ret)
-               synchronize_rcu();
        return ret;
 }
index e6144b8246fd27fe49bffd228b44a44c3e7cbd81..710e4599d76cffa7a56ad2891c127960b37cb3c0 100644
  * SOFTWARE.
  *
  */
-#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/rbtree.h>
-#include <linux/bitops.h>
-#include <linux/export.h>
+
+#include <asm-generic/bitops/le.h>
 
 #include "rds.h"
 
@@ -141,7 +140,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
        unsigned long flags;
 
        map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
-       if (!map)
+       if (map == NULL)
                return NULL;
 
        map->m_addr = addr;
@@ -159,7 +158,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
        ret = rds_cong_tree_walk(addr, map);
        spin_unlock_irqrestore(&rds_cong_lock, flags);
 
-       if (!ret) {
+       if (ret == NULL) {
                ret = map;
                map = NULL;
        }
@@ -205,7 +204,7 @@ int rds_cong_get_maps(struct rds_connection *conn)
        conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
        conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
 
-       if (!(conn->c_lcong && conn->c_fcong))
+       if (conn->c_lcong == NULL || conn->c_fcong == NULL)
                return -ENOMEM;
 
        return 0;
@@ -221,20 +220,6 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
        list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
                if (!test_and_set_bit(0, &conn->c_map_queued)) {
                        rds_stats_inc(s_cong_update_queued);
-                       /* We cannot inline the call to rds_send_xmit() here
-                        * for two reasons (both pertaining to a TCP transport):
-                        * 1. When we get here from the receive path, we
-                        *    are already holding the sock_lock (held by
-                        *    tcp_v4_rcv()). So inlining calls to
-                        *    tcp_setsockopt and/or tcp_sendmsg will deadlock
-                        *    when they try to get the sock_lock().
-                        * 2. Interrupts are masked so that we can mark the
-                        *    port congested from both send and recv paths.
-                        *    (See comment around declaration of rds_cong_lock).
-                        *    An attempt to get the sock_lock() here will
-                        *    therefore trigger warnings.
-                        * Defer the xmit to rds_send_worker() instead.
-                        */
                        queue_delayed_work(rds_wq, &conn->c_send_w, 0);
                }
        }
@@ -269,7 +254,6 @@ void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
                read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
        }
 }
-EXPORT_SYMBOL_GPL(rds_cong_map_updated);
 
 int rds_cong_updated_since(unsigned long *recent)
 {
@@ -299,7 +283,7 @@ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
        i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
        off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
 
-       __set_bit_le(off, (void *)map->m_page_addrs[i]);
+       generic___set_le_bit(off, (void *)map->m_page_addrs[i]);
 }
 
 void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
@@ -313,7 +297,7 @@ void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
        i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
        off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
 
-       __clear_bit_le(off, (void *)map->m_page_addrs[i]);
+       generic___clear_le_bit(off, (void *)map->m_page_addrs[i]);
 }
 
 static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
@@ -324,7 +308,7 @@ static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
        i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
        off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
 
-       return test_bit_le(off, (void *)map->m_page_addrs[i]);
+       return generic_test_le_bit(off, (void *)map->m_page_addrs[i]);
 }
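
The bit helpers in this file all share the index math above: the 16-bit port space is laid out across the map's pages, RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8, per rds.h) bits per page, so with 4 KiB pages the full space needs two pages. A quick check of the split, under that page-size assumption:

#include <stdint.h>
#include <stdio.h>

#define CONG_MAP_PAGE_BITS (4096 * 8)   /* PAGE_SIZE * 8, assuming 4 KiB pages */

int main(void)
{
        uint16_t port = 40000;
        unsigned i   = port / CONG_MAP_PAGE_BITS;       /* page index: 1 */
        unsigned off = port % CONG_MAP_PAGE_BITS;       /* bit in page: 7232 */

        printf("port %u -> page %u, bit %u\n", port, i, off);
        return 0;
}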
 
 void rds_cong_add_socket(struct rds_sock *rs)
index da6da57e5f36b5cc13a5bc92abfedb6a5ccea45d..273f064930a8e1aec7f8539ed54d928447e7a53a 100644
  */
 #include <linux/kernel.h>
 #include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/export.h>
 #include <net/inet_hashtables.h>
 
 #include "rds.h"
 #include "loop.h"
+#include "rdma.h"
 
 #define RDS_CONNECTION_HASH_BITS 12
 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
@@ -51,16 +50,10 @@ static struct kmem_cache *rds_conn_slab;
 
 static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
 {
-       static u32 rds_hash_secret __read_mostly;
-
-       unsigned long hash;
-
-       net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
-
        /* Pass NULL, don't need struct net for hash */
-       hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
-                             be32_to_cpu(faddr), 0,
-                             rds_hash_secret);
+       unsigned long hash = inet_ehashfn(NULL,
+                                         be32_to_cpu(laddr), 0,
+                                         be32_to_cpu(faddr), 0);
        return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
 }
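
Both the old and new bucket functions reduce to the same shape: mix laddr and faddr into a well-distributed word, then mask it down to one of the 4096 buckets. A hedged stand-in (the mixer below is a generic 64-bit finalizer, not inet_ehashfn):

#include <stdint.h>
#include <stdio.h>

#define CONN_HASH_BITS 12
#define CONN_HASH_MASK ((1u << CONN_HASH_BITS) - 1)

/* Generic avalanche mix (splitmix64-style finalizer), illustrative only. */
static uint32_t mix(uint32_t laddr, uint32_t faddr)
{
        uint64_t x = ((uint64_t)laddr << 32) | faddr;

        x ^= x >> 33;
        x *= 0xff51afd7ed558ccdULL;
        x ^= x >> 33;
        return (uint32_t)x;
}

int main(void)
{
        printf("bucket %u of %u\n",
               mix(0xc0a80001, 0x0a000001) & CONN_HASH_MASK,
               CONN_HASH_MASK + 1);
        return 0;
}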
 
@@ -69,14 +62,26 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
                var |= RDS_INFO_CONNECTION_FLAG_##suffix;       \
 } while (0)
 
-/* rcu read lock must be held or the connection spinlock */
+static inline int rds_conn_is_sending(struct rds_connection *conn)
+{
+       int ret = 0;
+
+       if (!mutex_trylock(&conn->c_send_lock))
+               ret = 1;
+       else
+               mutex_unlock(&conn->c_send_lock);
+
+       return ret;
+}
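+
The restored rds_conn_is_sending() is a lock probe: if the send mutex cannot be taken, a sender currently holds it; if it can, it is released immediately so the probe has no side effect. The same idiom in user space with pthreads, as a sketch:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER;

static int is_sending(void)
{
        if (pthread_mutex_trylock(&send_lock) != 0)
                return 1;               /* held elsewhere: a send is in flight */
        pthread_mutex_unlock(&send_lock);
        return 0;                       /* we got it, so nobody was sending */
}

int main(void)
{
        printf("sending: %d\n", is_sending());
        return 0;
}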
+
 static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
                                              __be32 laddr, __be32 faddr,
                                              struct rds_transport *trans)
 {
        struct rds_connection *conn, *ret = NULL;
+       struct hlist_node *pos;
 
-       hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+       hlist_for_each_entry(conn, pos, head, c_hash_node) {
                if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
                                conn->c_trans == trans) {
                        ret = conn;
@@ -94,7 +99,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
  * and receiving over this connection again in the future.  It is up to
  * the transport to have serialized this call with its send and recv.
  */
-static void rds_conn_reset(struct rds_connection *conn)
+void rds_conn_reset(struct rds_connection *conn)
 {
        rdsdebug("connection %pI4 to %pI4 reset\n",
          &conn->c_laddr, &conn->c_faddr);
@@ -121,19 +126,17 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
                                       struct rds_transport *trans, gfp_t gfp,
                                       int is_outgoing)
 {
-       struct rds_connection *conn, *parent = NULL;
+       struct rds_connection *conn, *tmp, *parent = NULL;
        struct hlist_head *head = rds_conn_bucket(laddr, faddr);
-       struct rds_transport *loop_trans;
        unsigned long flags;
        int ret;
-       struct rds_transport *otrans = trans;
 
-       if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
-               goto new_conn;
-       rcu_read_lock();
+       spin_lock_irqsave(&rds_conn_lock, flags);
        conn = rds_conn_lookup(head, laddr, faddr, trans);
-       if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
-           laddr == faddr && !is_outgoing) {
+       if (conn
+        && conn->c_loopback
+        && conn->c_trans != &rds_loop_transport
+        && !is_outgoing) {
                /* This is a looped back IB connection, and we're
                 * called by the code handling the incoming connect.
                 * We need a second connection object into which we
@@ -141,24 +144,26 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
                parent = conn;
                conn = parent->c_passive;
        }
-       rcu_read_unlock();
+       spin_unlock_irqrestore(&rds_conn_lock, flags);
        if (conn)
                goto out;
 
-new_conn:
-       conn = kmem_cache_zalloc(rds_conn_slab, gfp);
-       if (!conn) {
+       conn = kmem_cache_alloc(rds_conn_slab, gfp);
+       if (conn == NULL) {
                conn = ERR_PTR(-ENOMEM);
                goto out;
        }
 
+       memset(conn, 0, sizeof(*conn));
+
        INIT_HLIST_NODE(&conn->c_hash_node);
+       conn->c_version = RDS_PROTOCOL_3_0;
        conn->c_laddr = laddr;
        conn->c_faddr = faddr;
        spin_lock_init(&conn->c_lock);
        conn->c_next_tx_seq = 1;
 
-       init_waitqueue_head(&conn->c_waitq);
+       mutex_init(&conn->c_send_lock);
        INIT_LIST_HEAD(&conn->c_send_queue);
        INIT_LIST_HEAD(&conn->c_retrans);
 
@@ -174,9 +179,7 @@ new_conn:
         * can bind to the destination address then we'd rather the messages
         * flow through loopback rather than either transport.
         */
-       loop_trans = rds_trans_get_preferred(faddr);
-       if (loop_trans) {
-               rds_trans_put(loop_trans);
+       if (rds_trans_get_preferred(faddr)) {
                conn->c_loopback = 1;
                if (is_outgoing && trans->t_prefer_loopback) {
                        /* "outgoing" connection - and the transport
@@ -197,7 +200,6 @@ new_conn:
        }
 
        atomic_set(&conn->c_state, RDS_CONN_DOWN);
-       conn->c_send_gen = 0;
        conn->c_reconnect_jiffies = 0;
        INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
        INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
@@ -211,49 +213,26 @@ new_conn:
          trans->t_name ? trans->t_name : "[unknown]",
          is_outgoing ? "(outgoing)" : "");
 
-       /*
-        * Since we ran without holding the conn lock, someone could
-        * have created the same conn (either normal or passive) in the
-        * interim. We check while holding the lock. If we won, we complete
-        * init and return our conn. If we lost, we rollback and return the
-        * other one.
-        */
        spin_lock_irqsave(&rds_conn_lock, flags);
-       if (parent) {
-               /* Creating passive conn */
-               if (parent->c_passive) {
-                       trans->conn_free(conn->c_transport_data);
-                       kmem_cache_free(rds_conn_slab, conn);
-                       conn = parent->c_passive;
-               } else {
+       if (parent == NULL) {
+               tmp = rds_conn_lookup(head, laddr, faddr, trans);
+               if (tmp == NULL)
+                       hlist_add_head(&conn->c_hash_node, head);
+       } else {
+               tmp = parent->c_passive;
+               if (!tmp)
                        parent->c_passive = conn;
-                       rds_cong_add_conn(conn);
-                       rds_conn_count++;
-               }
+       }
+
+       if (tmp) {
+               trans->conn_free(conn->c_transport_data);
+               kmem_cache_free(rds_conn_slab, conn);
+               conn = tmp;
        } else {
-               /* Creating normal conn */
-               struct rds_connection *found;
-
-               if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
-                       found = NULL;
-               else
-                       found = rds_conn_lookup(head, laddr, faddr, trans);
-               if (found) {
-                       trans->conn_free(conn->c_transport_data);
-                       kmem_cache_free(rds_conn_slab, conn);
-                       conn = found;
-               } else {
-                       if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) ||
-                           (otrans->t_type != RDS_TRANS_TCP)) {
-                               /* Only the active side should be added to
-                                * reconnect list for TCP.
-                                */
-                               hlist_add_head_rcu(&conn->c_hash_node, head);
-                       }
-                       rds_cong_add_conn(conn);
-                       rds_conn_count++;
-               }
+               rds_cong_add_conn(conn);
+               rds_conn_count++;
        }
+
        spin_unlock_irqrestore(&rds_conn_lock, flags);
 
 out:
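
Both versions of this hunk implement the same pattern: allocate the candidate connection without the lock held (allocation may sleep), repeat the lookup under the lock, and roll back the fresh object if another thread won the race in the meantime. A minimal sketch of that shape, with illustrative names:

#include <pthread.h>
#include <stdlib.h>

struct conn { int key; struct conn *next; };

static pthread_mutex_t conn_lock = PTHREAD_MUTEX_INITIALIZER;
static struct conn *conn_list;

static struct conn *lookup(int key)
{
        struct conn *c;

        for (c = conn_list; c; c = c->next)
                if (c->key == key)
                        return c;
        return NULL;
}

static struct conn *get_conn(int key)
{
        struct conn *fresh = malloc(sizeof(*fresh));    /* lock not held: may sleep */
        struct conn *winner;

        if (!fresh)
                return NULL;
        fresh->key = key;
        pthread_mutex_lock(&conn_lock);
        winner = lookup(key);                   /* someone may have raced us */
        if (!winner) {
                fresh->next = conn_list;        /* we won: publish ours */
                conn_list = fresh;
                winner = fresh;
        } else {
                free(fresh);                    /* we lost: roll back */
        }
        pthread_mutex_unlock(&conn_lock);
        return winner;
}

int main(void)
{
        return get_conn(42) == get_conn(42) ? 0 : 1;
}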
@@ -265,100 +244,28 @@ struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
 {
        return __rds_conn_create(laddr, faddr, trans, gfp, 0);
 }
-EXPORT_SYMBOL_GPL(rds_conn_create);
 
 struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
                                       struct rds_transport *trans, gfp_t gfp)
 {
        return __rds_conn_create(laddr, faddr, trans, gfp, 1);
 }
-EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
-
-void rds_conn_shutdown(struct rds_connection *conn)
-{
-       /* shut it down unless it's down already */
-       if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
-               /*
-                * Quiesce the connection mgmt handlers before we start tearing
-                * things down. We don't hold the mutex for the entire
-                * duration of the shutdown operation, else we may be
-                * deadlocking with the CM handler. Instead, the CM event
-                * handler is supposed to check for state DISCONNECTING
-                */
-               mutex_lock(&conn->c_cm_lock);
-               if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
-                && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
-                       rds_conn_error(conn, "shutdown called in state %d\n",
-                                       atomic_read(&conn->c_state));
-                       mutex_unlock(&conn->c_cm_lock);
-                       return;
-               }
-               mutex_unlock(&conn->c_cm_lock);
-
-               wait_event(conn->c_waitq,
-                          !test_bit(RDS_IN_XMIT, &conn->c_flags));
-
-               conn->c_trans->conn_shutdown(conn);
-               rds_conn_reset(conn);
-
-               if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
-                       /* This can happen - eg when we're in the middle of tearing
-                        * down the connection, and someone unloads the rds module.
-                        * Quite reproducible with loopback connections.
-                        * Mostly harmless.
-                        */
-                       rds_conn_error(conn,
-                               "%s: failed to transition to state DOWN, "
-                               "current state is %d\n",
-                               __func__,
-                               atomic_read(&conn->c_state));
-                       return;
-               }
-       }
-
-       /* Then reconnect if it's still live.
-        * The passive side of an IB loopback connection is never added
-        * to the conn hash, so we never trigger a reconnect on this
-        * conn - the reconnect is always triggered by the active peer. */
-       cancel_delayed_work_sync(&conn->c_conn_w);
-       rcu_read_lock();
-       if (!hlist_unhashed(&conn->c_hash_node)) {
-               rcu_read_unlock();
-               rds_queue_reconnect(conn);
-       } else {
-               rcu_read_unlock();
-       }
-}
 
-/*
- * Stop and free a connection.
- *
- * This can only be used in very limited circumstances.  It assumes that once
- * the conn has been shutdown that no one else is referencing the connection.
- * We can only ensure this in the rmmod path in the current code.
- */
 void rds_conn_destroy(struct rds_connection *conn)
 {
        struct rds_message *rm, *rtmp;
-       unsigned long flags;
 
        rdsdebug("freeing conn %p for %pI4 -> "
                 "%pI4\n", conn, &conn->c_laddr,
                 &conn->c_faddr);
 
-       /* Ensure conn will not be scheduled for reconnect */
-       spin_lock_irq(&rds_conn_lock);
-       hlist_del_init_rcu(&conn->c_hash_node);
-       spin_unlock_irq(&rds_conn_lock);
-       synchronize_rcu();
-
-       /* shut the connection down */
-       rds_conn_drop(conn);
-       flush_work(&conn->c_down_w);
+       hlist_del_init(&conn->c_hash_node);
 
-       /* make sure lingering queued work won't try to ref the conn */
-       cancel_delayed_work_sync(&conn->c_send_w);
-       cancel_delayed_work_sync(&conn->c_recv_w);
+       /* wait for the rds thread to shut it down */
+       atomic_set(&conn->c_state, RDS_CONN_ERROR);
+       cancel_delayed_work(&conn->c_conn_w);
+       queue_work(rds_wq, &conn->c_down_w);
+       flush_workqueue(rds_wq);
 
        /* tear down queued messages */
        list_for_each_entry_safe(rm, rtmp,
@@ -383,11 +290,8 @@ void rds_conn_destroy(struct rds_connection *conn)
        BUG_ON(!list_empty(&conn->c_retrans));
        kmem_cache_free(rds_conn_slab, conn);
 
-       spin_lock_irqsave(&rds_conn_lock, flags);
        rds_conn_count--;
-       spin_unlock_irqrestore(&rds_conn_lock, flags);
 }
-EXPORT_SYMBOL_GPL(rds_conn_destroy);
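
The replacement destroy path above orders teardown by hand: publish the ERROR state, cancel any pending reconnect, hand shutdown to the worker, and flush the workqueue so nothing references the conn once it is freed. The same ordering, modelled with a plain thread (illustrative, not the kernel workqueue API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int state;        /* 0 = up, 1 = error: models conn->c_state */

static void *down_worker(void *arg)
{
        (void)arg;
        if (atomic_load(&state) == 1)
                puts("worker: tearing connection down");
        return NULL;
}

int main(void)
{
        pthread_t w;

        atomic_store(&state, 1);                        /* RDS_CONN_ERROR */
        pthread_create(&w, NULL, down_worker, NULL);    /* queue_work() */
        pthread_join(&w, NULL);                         /* flush_workqueue(rds_wq) */
        puts("destroy: safe to free conn");
        return 0;
}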
 
 static void rds_conn_message_info(struct socket *sock, unsigned int len,
                                  struct rds_info_iterator *iter,
@@ -395,26 +299,27 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
                                  int want_send)
 {
        struct hlist_head *head;
+       struct hlist_node *pos;
        struct list_head *list;
        struct rds_connection *conn;
        struct rds_message *rm;
-       unsigned int total = 0;
        unsigned long flags;
+       unsigned int total = 0;
        size_t i;
 
        len /= sizeof(struct rds_info_message);
 
-       rcu_read_lock();
+       spin_lock_irqsave(&rds_conn_lock, flags);
 
        for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
             i++, head++) {
-               hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+               hlist_for_each_entry(conn, pos, head, c_hash_node) {
                        if (want_send)
                                list = &conn->c_send_queue;
                        else
                                list = &conn->c_retrans;
 
-                       spin_lock_irqsave(&conn->c_lock, flags);
+                       spin_lock(&conn->c_lock);
 
                        /* XXX too lazy to maintain counts.. */
                        list_for_each_entry(rm, list, m_conn_item) {
@@ -425,10 +330,11 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
                                                          conn->c_faddr, 0);
                        }
 
-                       spin_unlock_irqrestore(&conn->c_lock, flags);
+                       spin_unlock(&conn->c_lock);
                }
        }
-       rcu_read_unlock();
+
+       spin_unlock_irqrestore(&rds_conn_lock, flags);
 
        lens->nr = total;
        lens->each = sizeof(struct rds_info_message);
@@ -457,17 +363,20 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 {
        uint64_t buffer[(item_len + 7) / 8];
        struct hlist_head *head;
+       struct hlist_node *pos;
+       struct hlist_node *tmp;
        struct rds_connection *conn;
+       unsigned long flags;
        size_t i;
 
-       rcu_read_lock();
+       spin_lock_irqsave(&rds_conn_lock, flags);
 
        lens->nr = 0;
        lens->each = item_len;
 
        for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
             i++, head++) {
-               hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+               hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) {
 
                        /* XXX no c_lock usage.. */
                        if (!visitor(conn, buffer))
@@ -483,9 +392,9 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
                        lens->nr++;
                }
        }
-       rcu_read_unlock();
+
+       spin_unlock_irqrestore(&rds_conn_lock, flags);
 }
-EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
 
 static int rds_conn_info_visitor(struct rds_connection *conn,
                                  void *buffer)
@@ -500,8 +409,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn,
                sizeof(cinfo->transport));
        cinfo->flags = 0;
 
-       rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
-                         SENDING);
+       rds_conn_info_set(cinfo->flags,
+                         rds_conn_is_sending(conn), SENDING);
        /* XXX Future: return the state rather than these funky bits */
        rds_conn_info_set(cinfo->flags,
                          atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
@@ -521,12 +430,12 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
                                sizeof(struct rds_info_connection));
 }
 
-int rds_conn_init(void)
+int __init rds_conn_init(void)
 {
        rds_conn_slab = kmem_cache_create("rds_connection",
                                          sizeof(struct rds_connection),
                                          0, 0, NULL);
-       if (!rds_conn_slab)
+       if (rds_conn_slab == NULL)
                return -ENOMEM;
 
        rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
@@ -561,19 +470,6 @@ void rds_conn_drop(struct rds_connection *conn)
        atomic_set(&conn->c_state, RDS_CONN_ERROR);
        queue_work(rds_wq, &conn->c_down_w);
 }
-EXPORT_SYMBOL_GPL(rds_conn_drop);
-
-/*
- * If the connection is down, trigger a connect. We may have scheduled a
- * delayed reconnect however - in this case we should not interfere.
- */
-void rds_conn_connect_if_down(struct rds_connection *conn)
-{
-       if (rds_conn_state(conn) == RDS_CONN_DOWN &&
-           !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
-               queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
-}
-EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
 
 /*
  * An error occurred on the connection
index ba2dffeff60876ca669993d1863dcbb6cb76a740..4933b380985eb730b496dd21152dc470fff7a10c 100644
 #include <linux/inetdevice.h>
 #include <linux/if_arp.h>
 #include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/module.h>
 
 #include "rds.h"
 #include "ib.h"
 
-static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
+unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
 unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
-unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
 
 module_param(fmr_pool_size, int, 0444);
 MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
 module_param(fmr_message_size, int, 0444);
 MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
-module_param(rds_ib_retry_count, int, 0444);
-MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
 
-/*
- * we have a clumsy combination of RCU and a rwsem protecting this list
- * because it is used both in the get_mr fast path and while blocking in
- * the FMR flushing path.
- */
-DECLARE_RWSEM(rds_ib_devices_lock);
 struct list_head rds_ib_devices;
 
 /* NOTE: if also grabbing ibdev lock, grab this first */
 DEFINE_SPINLOCK(ib_nodev_conns_lock);
 LIST_HEAD(ib_nodev_conns);
 
-static void rds_ib_nodev_connect(void)
-{
-       struct rds_ib_connection *ic;
-
-       spin_lock(&ib_nodev_conns_lock);
-       list_for_each_entry(ic, &ib_nodev_conns, ib_node)
-               rds_conn_connect_if_down(ic->conn);
-       spin_unlock(&ib_nodev_conns_lock);
-}
-
-static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
-{
-       struct rds_ib_connection *ic;
-       unsigned long flags;
-
-       spin_lock_irqsave(&rds_ibdev->spinlock, flags);
-       list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
-               rds_conn_drop(ic->conn);
-       spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
-}
-
-/*
- * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
- * from interrupt context so we push freeing off into a work struct in krdsd.
- */
-static void rds_ib_dev_free(struct work_struct *work)
-{
-       struct rds_ib_ipaddr *i_ipaddr, *i_next;
-       struct rds_ib_device *rds_ibdev = container_of(work,
-                                       struct rds_ib_device, free_work);
-
-       if (rds_ibdev->mr_pool)
-               rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
-       if (rds_ibdev->mr)
-               ib_dereg_mr(rds_ibdev->mr);
-       if (rds_ibdev->pd)
-               ib_dealloc_pd(rds_ibdev->pd);
-
-       list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
-               list_del(&i_ipaddr->list);
-               kfree(i_ipaddr);
-       }
-
-       kfree(rds_ibdev);
-}
-
-void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
-{
-       BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
-       if (atomic_dec_and_test(&rds_ibdev->refcount))
-               queue_work(rds_wq, &rds_ibdev->free_work);
-}
-
-static void rds_ib_add_one(struct ib_device *device)
+void rds_ib_add_one(struct ib_device *device)
 {
        struct rds_ib_device *rds_ibdev;
        struct ib_device_attr *dev_attr;
@@ -137,124 +73,85 @@ static void rds_ib_add_one(struct ib_device *device)
                goto free_attr;
        }
 
-       rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
-                                ibdev_to_node(device));
+       rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
        if (!rds_ibdev)
                goto free_attr;
 
        spin_lock_init(&rds_ibdev->spinlock);
-       atomic_set(&rds_ibdev->refcount, 1);
-       INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
 
        rds_ibdev->max_wrs = dev_attr->max_qp_wr;
        rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
 
+       rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1);
+       rds_ibdev->fmr_page_size  = 1 << rds_ibdev->fmr_page_shift;
+       rds_ibdev->fmr_page_mask  = ~((u64) rds_ibdev->fmr_page_size - 1);
        rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
        rds_ibdev->max_fmrs = dev_attr->max_fmr ?
                        min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
                        fmr_pool_size;
 
-       rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
-       rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
-
        rds_ibdev->dev = device;
        rds_ibdev->pd = ib_alloc_pd(device);
-       if (IS_ERR(rds_ibdev->pd)) {
-               rds_ibdev->pd = NULL;
-               goto put_dev;
-       }
+       if (IS_ERR(rds_ibdev->pd))
+               goto free_dev;
 
-       rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
-       if (IS_ERR(rds_ibdev->mr)) {
-               rds_ibdev->mr = NULL;
-               goto put_dev;
-       }
+       rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
+                                     IB_ACCESS_LOCAL_WRITE);
+       if (IS_ERR(rds_ibdev->mr))
+               goto err_pd;
 
        rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
        if (IS_ERR(rds_ibdev->mr_pool)) {
                rds_ibdev->mr_pool = NULL;
-               goto put_dev;
+               goto err_mr;
        }
 
        INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
        INIT_LIST_HEAD(&rds_ibdev->conn_list);
-
-       down_write(&rds_ib_devices_lock);
-       list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
-       up_write(&rds_ib_devices_lock);
-       atomic_inc(&rds_ibdev->refcount);
+       list_add_tail(&rds_ibdev->list, &rds_ib_devices);
 
        ib_set_client_data(device, &rds_ib_client, rds_ibdev);
-       atomic_inc(&rds_ibdev->refcount);
 
-       rds_ib_nodev_connect();
+       goto free_attr;
 
-put_dev:
-       rds_ib_dev_put(rds_ibdev);
+err_mr:
+       ib_dereg_mr(rds_ibdev->mr);
+err_pd:
+       ib_dealloc_pd(rds_ibdev->pd);
+free_dev:
+       kfree(rds_ibdev);
 free_attr:
        kfree(dev_attr);
 }
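
The fmr_page_* fields set earlier in this hunk derive everything from the device's reported page_size_cap. Worked through with an assumed cap of 4096 (ffs(4096) == 13):

#include <stdint.h>
#include <stdio.h>
#include <strings.h>    /* ffs() */

int main(void)
{
        uint32_t page_size_cap = 4096;  /* assumed device capability */
        int shift = ffs(page_size_cap) - 1;

        if (shift < 9)
                shift = 9;              /* clamp to a 512-byte minimum, as above */

        uint64_t size = 1ull << shift;  /* 4096 */
        uint64_t mask = ~(size - 1);    /* 0xfffffffffffff000 */

        printf("shift %d size %llu mask %#llx\n", shift,
               (unsigned long long)size, (unsigned long long)mask);
        return 0;
}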
 
-/*
- * New connections use this to find the device to associate with the
- * connection.  It's not in the fast path so we're not concerned about the
- * performance of the IB call.  (As of this writing, it uses an interrupt
- * blocking spinlock to serialize walking a per-device list of all registered
- * clients.)
- *
- * RCU is used to handle incoming connections racing with device teardown.
- * Rather than use a lock to serialize removal from the client_data and
- * getting a new reference, we use an RCU grace period.  The destruction
- * path removes the device from client_data and then waits for all RCU
- * readers to finish.
- *
- * A new connection can get NULL from this if its arriving on a
- * device that is in the process of being removed.
- */
-struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
-{
-       struct rds_ib_device *rds_ibdev;
-
-       rcu_read_lock();
-       rds_ibdev = ib_get_client_data(device, &rds_ib_client);
-       if (rds_ibdev)
-               atomic_inc(&rds_ibdev->refcount);
-       rcu_read_unlock();
-       return rds_ibdev;
-}
-
-/*
- * The IB stack is letting us know that a device is going away.  This can
- * happen if the underlying HCA driver is removed or if PCI hotplug is removing
- * the pci function, for example.
- *
- * This can be called at any time and can be racing with any other RDS path.
- */
-static void rds_ib_remove_one(struct ib_device *device)
+void rds_ib_remove_one(struct ib_device *device)
 {
        struct rds_ib_device *rds_ibdev;
+       struct rds_ib_ipaddr *i_ipaddr, *i_next;
 
        rds_ibdev = ib_get_client_data(device, &rds_ib_client);
        if (!rds_ibdev)
                return;
 
-       rds_ib_dev_shutdown(rds_ibdev);
+       list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
+               list_del(&i_ipaddr->list);
+               kfree(i_ipaddr);
+       }
+
+       rds_ib_destroy_conns(rds_ibdev);
 
-       /* stop connection attempts from getting a reference to this device. */
-       ib_set_client_data(device, &rds_ib_client, NULL);
+       if (rds_ibdev->mr_pool)
+               rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
 
-       down_write(&rds_ib_devices_lock);
-       list_del_rcu(&rds_ibdev->list);
-       up_write(&rds_ib_devices_lock);
+       ib_dereg_mr(rds_ibdev->mr);
 
-       /*
-        * This synchronize rcu is waiting for readers of both the ib
-        * client data and the devices list to finish before we drop
-        * both of those references.
-        */
-       synchronize_rcu();
-       rds_ib_dev_put(rds_ibdev);
-       rds_ib_dev_put(rds_ibdev);
+       while (ib_dealloc_pd(rds_ibdev->pd)) {
+               rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
+               msleep(1);
+       }
+
+       list_del(&rds_ibdev->list);
+       kfree(rds_ibdev);
 }
 
 struct ib_client rds_ib_client = {
@@ -285,10 +182,10 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
                ic = conn->c_transport_data;
                dev_addr = &ic->i_cm_id->route.addr.dev_addr;
 
-               rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
-               rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+               ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+               ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
 
-               rds_ibdev = ic->rds_ibdev;
+               rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
                iinfo->max_send_wr = ic->i_send_ring.w_nr;
                iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
                iinfo->max_send_sge = rds_ibdev->max_sge;
@@ -326,9 +223,9 @@ static int rds_ib_laddr_check(__be32 addr)
        /* Create a CMA ID and try to bind it. This catches both
         * IB and iWARP capable NICs.
         */
-       cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
-       if (IS_ERR(cm_id))
-               return PTR_ERR(cm_id);
+       cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
+       if (!cm_id)
+               return -EADDRNOTAVAIL;
 
        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
@@ -338,8 +235,7 @@ static int rds_ib_laddr_check(__be32 addr)
        ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
        /* due to this, we will claim to support iWARP devices unless we
           check node_type. */
-       if (ret || !cm_id->device ||
-           cm_id->device->node_type != RDMA_NODE_IB_CA)
+       if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
                ret = -EADDRNOTAVAIL;
 
        rdsdebug("addr %pI4 ret %d node type %d\n",
@@ -351,18 +247,11 @@ static int rds_ib_laddr_check(__be32 addr)
        return ret;
 }
 
-static void rds_ib_unregister_client(void)
-{
-       ib_unregister_client(&rds_ib_client);
-       /* wait for rds_ib_dev_free() to complete */
-       flush_workqueue(rds_wq);
-}
-
 void rds_ib_exit(void)
 {
        rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
-       rds_ib_unregister_client();
        rds_ib_destroy_nodev_conns();
+       ib_unregister_client(&rds_ib_client);
        rds_ib_sysctl_exit();
        rds_ib_recv_exit();
        rds_trans_unregister(&rds_ib_transport);
@@ -372,14 +261,15 @@ struct rds_transport rds_ib_transport = {
        .laddr_check            = rds_ib_laddr_check,
        .xmit_complete          = rds_ib_xmit_complete,
        .xmit                   = rds_ib_xmit,
+       .xmit_cong_map          = NULL,
        .xmit_rdma              = rds_ib_xmit_rdma,
-       .xmit_atomic            = rds_ib_xmit_atomic,
        .recv                   = rds_ib_recv,
        .conn_alloc             = rds_ib_conn_alloc,
        .conn_free              = rds_ib_conn_free,
        .conn_connect           = rds_ib_conn_connect,
        .conn_shutdown          = rds_ib_conn_shutdown,
        .inc_copy_to_user       = rds_ib_inc_copy_to_user,
+       .inc_purge              = rds_ib_inc_purge,
        .inc_free               = rds_ib_inc_free,
        .cm_initiate_connect    = rds_ib_cm_initiate_connect,
        .cm_handle_connect      = rds_ib_cm_handle_connect,
@@ -392,10 +282,9 @@ struct rds_transport rds_ib_transport = {
        .flush_mrs              = rds_ib_flush_mrs,
        .t_owner                = THIS_MODULE,
        .t_name                 = "infiniband",
-       .t_type                 = RDS_TRANS_IB
 };
 
-int rds_ib_init(void)
+int __init rds_ib_init(void)
 {
        int ret;
 
@@ -426,7 +315,7 @@ out_recv:
 out_sysctl:
        rds_ib_sysctl_exit();
 out_ibreg:
-       rds_ib_unregister_client();
+       ib_unregister_client(&rds_ib_client);
 out:
        return ret;
 }
index c36d713229e0f5c5a1b43fe227a1e04480ba100d..069206cae733c3a9a88b5efda1b6478a92355b85 100644
@@ -3,14 +3,11 @@
 
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
 #include "rds.h"
 #include "rdma_transport.h"
 
 #define RDS_FMR_SIZE                   256
-#define RDS_FMR_POOL_SIZE              8192
+#define RDS_FMR_POOL_SIZE              4096
 
 #define RDS_IB_MAX_SGE                 8
 #define RDS_IB_RECV_SGE                2
 #define RDS_IB_DEFAULT_RECV_WR         1024
 #define RDS_IB_DEFAULT_SEND_WR         256
 
-#define RDS_IB_DEFAULT_RETRY_COUNT     2
-
 #define RDS_IB_SUPPORTED_PROTOCOLS     0x00000003      /* minor versions supported */
 
-#define RDS_IB_RECYCLE_BATCH_COUNT     32
-
-extern struct rw_semaphore rds_ib_devices_lock;
 extern struct list_head rds_ib_devices;
 
 /*
@@ -32,29 +24,20 @@ extern struct list_head rds_ib_devices;
  * try and minimize the amount of memory tied up in both the device and
  * socket receive queues.
  */
+/* page offset of the final full frag that fits in the page */
+#define RDS_PAGE_LAST_OFF (((PAGE_SIZE  / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
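+
The macro is just the offset of the last frag that still fits entirely in one page; with 4 KiB pages and 4 KiB frags it degenerates to 0 (one frag per page). A quick check with an assumed 16 KiB page and 4 KiB frag size:

#include <stdio.h>

int main(void)
{
        unsigned page_size = 16384, frag_size = 4096;   /* assumed sizes */

        /* ((16384 / 4096) - 1) * 4096 = 12288 */
        printf("last full-frag offset: %u\n",
               ((page_size / frag_size) - 1) * frag_size);
        return 0;
}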
 struct rds_page_frag {
        struct list_head        f_item;
-       struct list_head        f_cache_entry;
-       struct scatterlist      f_sg;
+       struct page             *f_page;
+       unsigned long           f_offset;
+       dma_addr_t              f_mapped;
 };
 
 struct rds_ib_incoming {
        struct list_head        ii_frags;
-       struct list_head        ii_cache_entry;
        struct rds_incoming     ii_inc;
 };
 
-struct rds_ib_cache_head {
-       struct list_head *first;
-       unsigned long count;
-};
-
-struct rds_ib_refill_cache {
-       struct rds_ib_cache_head __percpu *percpu;
-       struct list_head         *xfer;
-       struct list_head         *ready;
-};
-
 struct rds_ib_connect_private {
        /* Add new fields at the end, and don't permute existing fields. */
        __be32                  dp_saddr;
@@ -68,7 +51,8 @@ struct rds_ib_connect_private {
 };
 
 struct rds_ib_send_work {
-       void                    *s_op;
+       struct rds_message      *s_rm;
+       struct rds_rdma_op      *s_op;
        struct ib_send_wr       s_wr;
        struct ib_sge           s_sge[RDS_IB_MAX_SGE];
        unsigned long           s_queued;
@@ -106,14 +90,12 @@ struct rds_ib_connection {
 
        /* tx */
        struct rds_ib_work_ring i_send_ring;
-       struct rm_data_op       *i_data_op;
+       struct rds_message      *i_rm;
        struct rds_header       *i_send_hdrs;
        u64                     i_send_hdrs_dma;
        struct rds_ib_send_work *i_sends;
-       atomic_t                i_signaled_sends;
 
        /* rx */
-       struct tasklet_struct   i_recv_tasklet;
        struct mutex            i_recv_mutex;
        struct rds_ib_work_ring i_recv_ring;
        struct rds_ib_incoming  *i_ibinc;
@@ -121,9 +103,8 @@ struct rds_ib_connection {
        struct rds_header       *i_recv_hdrs;
        u64                     i_recv_hdrs_dma;
        struct rds_ib_recv_work *i_recvs;
+       struct rds_page_frag    i_frag;
        u64                     i_ack_recv;     /* last ACK received */
-       struct rds_ib_refill_cache i_cache_incs;
-       struct rds_ib_refill_cache i_cache_frags;
 
        /* sending acks */
        unsigned long           i_ack_flags;
@@ -154,6 +135,7 @@ struct rds_ib_connection {
 
        /* Batched completions */
        unsigned int            i_unsignaled_wrs;
+       long                    i_unsignaled_bytes;
 };
 
 /* This assumes that atomic_t is at least 32 bits */
@@ -175,20 +157,16 @@ struct rds_ib_device {
        struct ib_pd            *pd;
        struct ib_mr            *mr;
        struct rds_ib_mr_pool   *mr_pool;
+       int                     fmr_page_shift;
+       int                     fmr_page_size;
+       u64                     fmr_page_mask;
        unsigned int            fmr_max_remaps;
        unsigned int            max_fmrs;
        int                     max_sge;
        unsigned int            max_wrs;
-       unsigned int            max_initiator_depth;
-       unsigned int            max_responder_resources;
        spinlock_t              spinlock;       /* protect the above */
-       atomic_t                refcount;
-       struct work_struct      free_work;
 };
 
-#define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device)
-#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
-
 /* bits for i_ack_flags */
 #define IB_ACK_IN_FLIGHT       0
 #define IB_ACK_REQUESTED       1
@@ -224,8 +202,6 @@ struct rds_ib_statistics {
        uint64_t        s_ib_rdma_mr_pool_flush;
        uint64_t        s_ib_rdma_mr_pool_wait;
        uint64_t        s_ib_rdma_mr_pool_depleted;
-       uint64_t        s_ib_atomic_cswp;
-       uint64_t        s_ib_atomic_fadd;
 };
 
 extern struct workqueue_struct *rds_ib_wq;
@@ -265,12 +241,12 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
 
 /* ib.c */
 extern struct rds_transport rds_ib_transport;
-struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
-void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
+extern void rds_ib_add_one(struct ib_device *device);
+extern void rds_ib_remove_one(struct ib_device *device);
 extern struct ib_client rds_ib_client;
 
+extern unsigned int fmr_pool_size;
 extern unsigned int fmr_message_size;
-extern unsigned int rds_ib_retry_count;
 
 extern spinlock_t ib_nodev_conns_lock;
 extern struct list_head ib_nodev_conns;
@@ -281,7 +257,7 @@ void rds_ib_conn_free(void *arg);
 int rds_ib_conn_connect(struct rds_connection *conn);
 void rds_ib_conn_shutdown(struct rds_connection *conn);
 void rds_ib_state_change(struct sock *sk);
-int rds_ib_listen_init(void);
+int __init rds_ib_listen_init(void);
 void rds_ib_listen_stop(void);
 void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -298,7 +274,15 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
-void rds_ib_destroy_nodev_conns(void);
+void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock);
+static inline void rds_ib_destroy_nodev_conns(void)
+{
+       __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
+}
+static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
+{
+       __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
+}
 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
@@ -309,16 +293,16 @@ void rds_ib_free_mr(void *trans_private, int invalidate);
 void rds_ib_flush_mrs(void);
 
 /* ib_recv.c */
-int rds_ib_recv_init(void);
+int __init rds_ib_recv_init(void);
 void rds_ib_recv_exit(void);
 int rds_ib_recv(struct rds_connection *conn);
-int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
-void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
-void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
+int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+                      gfp_t page_gfp, int prefill);
+void rds_ib_inc_purge(struct rds_incoming *inc);
 void rds_ib_inc_free(struct rds_incoming *inc);
-int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+                            size_t size);
 void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
-void rds_ib_recv_tasklet_fn(unsigned long data);
 void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
 void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
 void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
@@ -339,19 +323,17 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
 extern wait_queue_head_t rds_ib_ring_empty_wait;
 
 /* ib_send.c */
-char *rds_ib_wc_status_str(enum ib_wc_status status);
 void rds_ib_xmit_complete(struct rds_connection *conn);
 int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
                unsigned int hdr_off, unsigned int sg, unsigned int off);
 void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
 void rds_ib_send_init_ring(struct rds_ib_connection *ic);
 void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
 void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
-                            u32 *adv_credits, int need_posted, int max_posted);
-int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
+                            u32 *adv_credits, int need_posted);
 
 /* ib_stats.c */
 DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
@@ -360,7 +342,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
                                    unsigned int avail);
 
 /* ib_sysctl.c */
-int rds_ib_sysctl_init(void);
+int __init rds_ib_sysctl_init(void);
 void rds_ib_sysctl_exit(void);
 extern unsigned long rds_ib_sysctl_max_send_wr;
 extern unsigned long rds_ib_sysctl_max_recv_wr;
@@ -368,5 +350,22 @@ extern unsigned long rds_ib_sysctl_max_unsig_wrs;
 extern unsigned long rds_ib_sysctl_max_unsig_bytes;
 extern unsigned long rds_ib_sysctl_max_recv_allocation;
 extern unsigned int rds_ib_sysctl_flow_control;
+extern ctl_table rds_ib_sysctl_table[];
+
+/*
+ * Helper functions for getting/setting the header and data SGEs in
+ * RDS packets (not RDMA)
+ */
+static inline struct ib_sge *
+rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
+{
+       return &sge[0];
+}
+
+static inline struct ib_sge *
+rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
+{
+       return &sge[1];
+}
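+
A usage sketch for the two helpers above: an RDS data packet posts one work request whose sge[0] carries the header and sge[1] the payload. The struct below only models the real ib_sge fields (addr, length, lkey), and the 48-byte length is an assumption standing in for sizeof(struct rds_header):

#include <stdint.h>
#include <stdio.h>

struct sge_model { uint64_t addr; uint32_t length; uint32_t lkey; };

static void fill_packet_sges(struct sge_model sge[2], uint64_t hdr_dma,
                             uint64_t data_dma, uint32_t data_len, uint32_t lkey)
{
        sge[0].addr   = hdr_dma;        /* header slot, per rds_ib_header_sge() */
        sge[0].length = 48;             /* assumed sizeof(struct rds_header) */
        sge[0].lkey   = lkey;
        sge[1].addr   = data_dma;       /* payload slot, per rds_ib_data_sge() */
        sge[1].length = data_len;
        sge[1].lkey   = lkey;
}

int main(void)
{
        struct sge_model sge[2];

        fill_packet_sges(sge, 0x1000, 0x2000, 4096, 42);
        printf("hdr %u bytes, data %u bytes\n", sge[0].length, sge[1].length);
        return 0;
}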
 
 #endif
index 8a09ee7db3c13bdd833784c4ee311e048a7c2789..f8e40e1a6038882eb5950da72506c9f70cbd7045 100644
  */
 #include <linux/kernel.h>
 #include <linux/in.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/ratelimit.h>
 
 #include "rds.h"
 #include "ib.h"
 
-static char *rds_ib_event_type_strings[] = {
-#define RDS_IB_EVENT_STRING(foo) \
-               [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
-       RDS_IB_EVENT_STRING(CQ_ERR),
-       RDS_IB_EVENT_STRING(QP_FATAL),
-       RDS_IB_EVENT_STRING(QP_REQ_ERR),
-       RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
-       RDS_IB_EVENT_STRING(COMM_EST),
-       RDS_IB_EVENT_STRING(SQ_DRAINED),
-       RDS_IB_EVENT_STRING(PATH_MIG),
-       RDS_IB_EVENT_STRING(PATH_MIG_ERR),
-       RDS_IB_EVENT_STRING(DEVICE_FATAL),
-       RDS_IB_EVENT_STRING(PORT_ACTIVE),
-       RDS_IB_EVENT_STRING(PORT_ERR),
-       RDS_IB_EVENT_STRING(LID_CHANGE),
-       RDS_IB_EVENT_STRING(PKEY_CHANGE),
-       RDS_IB_EVENT_STRING(SM_CHANGE),
-       RDS_IB_EVENT_STRING(SRQ_ERR),
-       RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
-       RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
-       RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
-#undef RDS_IB_EVENT_STRING
-};
-
-static char *rds_ib_event_str(enum ib_event_type type)
-{
-       return rds_str_array(rds_ib_event_type_strings,
-                            ARRAY_SIZE(rds_ib_event_type_strings), type);
-};
-
 /*
  * Set the selected protocol version
  */
@@ -126,46 +94,24 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 {
        const struct rds_ib_connect_private *dp = NULL;
        struct rds_ib_connection *ic = conn->c_transport_data;
+       struct rds_ib_device *rds_ibdev;
        struct ib_qp_attr qp_attr;
        int err;
 
-       if (event->param.conn.private_data_len >= sizeof(*dp)) {
+       if (event->param.conn.private_data_len) {
                dp = event->param.conn.private_data;
 
-               /* make sure it isn't empty data */
-               if (dp->dp_protocol_major) {
-                       rds_ib_set_protocol(conn,
+               rds_ib_set_protocol(conn,
                                RDS_PROTOCOL(dp->dp_protocol_major,
-                               dp->dp_protocol_minor));
-                       rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
-               }
+                                       dp->dp_protocol_minor));
+               rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
        }
 
-       if (conn->c_version < RDS_PROTOCOL(3,1)) {
-               printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
-                      " no longer supported\n",
-                      &conn->c_faddr,
-                      RDS_PROTOCOL_MAJOR(conn->c_version),
-                      RDS_PROTOCOL_MINOR(conn->c_version));
-               rds_conn_destroy(conn);
-               return;
-       } else {
-               printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
-                      &conn->c_faddr,
-                      RDS_PROTOCOL_MAJOR(conn->c_version),
-                      RDS_PROTOCOL_MINOR(conn->c_version),
-                      ic->i_flowctl ? ", flow control" : "");
-       }
-
-       /*
-        * Init rings and fill recv. this needs to wait until protocol negotiation
-        * is complete, since ring layout is different from 3.0 to 3.1.
-        */
-       rds_ib_send_init_ring(ic);
-       rds_ib_recv_init_ring(ic);
-       /* Post receive buffers - as a side effect, this will update
-        * the posted credit count. */
-       rds_ib_recv_refill(conn, 1);
+       printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+                       &conn->c_laddr,
+                       RDS_PROTOCOL_MAJOR(conn->c_version),
+                       RDS_PROTOCOL_MINOR(conn->c_version),
+                       ic->i_flowctl ? ", flow control" : "");
 
        /* Tune RNR behavior */
        rds_ib_tune_rnr(ic, &qp_attr);
@@ -175,25 +121,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
        if (err)
                printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
 
-       /* update ib_device with this local ipaddr */
-       err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
+       /* update ib_device with this local ipaddr & conn */
+       rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+       err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
        if (err)
-               printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
-                       err);
+               printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
+       rds_ib_add_conn(rds_ibdev, conn);
 
        /* If the peer gave us the last packet it saw, process this as if
         * we had received a regular ACK. */
-       if (dp) {
-               /* dp structure start is not guaranteed to be 8 bytes aligned.
-                * Since dp_ack_seq is 64-bit extended load operations can be
-                * used so go through get_unaligned to avoid unaligned errors.
-                */
-               __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
-
-               if (dp_ack_seq)
-                       rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
-                                           NULL);
-       }
+       if (dp && dp->dp_ack_seq)
+               rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
 
        rds_connect_complete(conn);
 }
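
The version handling above leans on RDS_PROTOCOL() packing major and minor into one word; per rds.h that is major << 8 | minor, restated below as an assumption so the check is self-contained:

#include <stdio.h>

#define RDS_PROTOCOL(maj, min)  (((maj) << 8) | (min))
#define RDS_PROTOCOL_MAJOR(v)   ((v) >> 8)
#define RDS_PROTOCOL_MINOR(v)   ((v) & 255)

int main(void)
{
        unsigned v = RDS_PROTOCOL(3, 0);        /* RDS_PROTOCOL_3_0 */

        printf("%#x -> %u.%u\n", v, RDS_PROTOCOL_MAJOR(v), RDS_PROTOCOL_MINOR(v));
        return 0;
}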
@@ -201,23 +139,18 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
                        struct rdma_conn_param *conn_param,
                        struct rds_ib_connect_private *dp,
-                       u32 protocol_version,
-                       u32 max_responder_resources,
-                       u32 max_initiator_depth)
+                       u32 protocol_version)
 {
-       struct rds_ib_connection *ic = conn->c_transport_data;
-       struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
-
        memset(conn_param, 0, sizeof(struct rdma_conn_param));
-
-       conn_param->responder_resources =
-               min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
-       conn_param->initiator_depth =
-               min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
-       conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
+       /* XXX tune these? */
+       conn_param->responder_resources = 1;
+       conn_param->initiator_depth = 1;
+       conn_param->retry_count = 7;
        conn_param->rnr_retry_count = 7;
 
        if (dp) {
+               struct rds_ib_connection *ic = conn->c_transport_data;
+
                memset(dp, 0, sizeof(*dp));
                dp->dp_saddr = conn->c_laddr;
                dp->dp_daddr = conn->c_faddr;
@@ -242,8 +175,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 
 static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
 {
-       rdsdebug("event %u (%s) data %p\n",
-                event->event, rds_ib_event_str(event->event), data);
+       rdsdebug("event %u data %p\n", event->event, data);
 }
 
 static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
@@ -251,19 +183,16 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
        struct rds_connection *conn = data;
        struct rds_ib_connection *ic = conn->c_transport_data;
 
-       rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
-                rds_ib_event_str(event->event));
+       rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
 
        switch (event->event) {
        case IB_EVENT_COMM_EST:
                rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
                break;
        default:
-               rdsdebug("Fatal QP Event %u (%s) "
-                       "- connection %pI4->%pI4, reconnecting\n",
-                       event->event, rds_ib_event_str(event->event),
-                       &conn->c_laddr, &conn->c_faddr);
-               rds_conn_drop(conn);
+               printk(KERN_WARNING "RDS/ib: unhandled QP event %u "
+                      "on connection to %pI4\n", event->event,
+                      &conn->c_faddr);
                break;
        }
 }
@@ -280,16 +209,18 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
        struct rds_ib_device *rds_ibdev;
        int ret;
 
-       /*
-        * It's normal to see a null device if an incoming connection races
-        * with device removal, so we don't print a warning.
+       /* rds_ib_add_one creates a rds_ib_device object per IB device,
+        * and allocates a protection domain, memory range and FMR pool
+        * for each.  If that fails for any reason, it will not register
+        * the rds_ibdev at all.
         */
-       rds_ibdev = rds_ib_get_client_data(dev);
-       if (!rds_ibdev)
+       rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
+       if (rds_ibdev == NULL) {
+               if (printk_ratelimit())
+                       printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
+                                       dev->name);
                return -EOPNOTSUPP;
-
-       /* add the conn now so that connection establishment has the dev */
-       rds_ib_add_conn(rds_ibdev, conn);
+       }
 
        if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
                rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
@@ -360,7 +291,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
                                           ic->i_send_ring.w_nr *
                                                sizeof(struct rds_header),
                                           &ic->i_send_hdrs_dma, GFP_KERNEL);
-       if (!ic->i_send_hdrs) {
+       if (ic->i_send_hdrs == NULL) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent send failed\n");
                goto out;
@@ -370,7 +301,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
                                           ic->i_recv_ring.w_nr *
                                                sizeof(struct rds_header),
                                           &ic->i_recv_hdrs_dma, GFP_KERNEL);
-       if (!ic->i_recv_hdrs) {
+       if (ic->i_recv_hdrs == NULL) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent recv failed\n");
                goto out;
@@ -378,64 +309,54 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 
        ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
                                       &ic->i_ack_dma, GFP_KERNEL);
-       if (!ic->i_ack) {
+       if (ic->i_ack == NULL) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent ack failed\n");
                goto out;
        }
 
-       ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
-                                  ibdev_to_node(dev));
-       if (!ic->i_sends) {
+       ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
+       if (ic->i_sends == NULL) {
                ret = -ENOMEM;
                rdsdebug("send allocation failed\n");
                goto out;
        }
+       rds_ib_send_init_ring(ic);
 
-       ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
-                                  ibdev_to_node(dev));
-       if (!ic->i_recvs) {
+       ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
+       if (ic->i_recvs == NULL) {
                ret = -ENOMEM;
                rdsdebug("recv allocation failed\n");
                goto out;
        }
 
+       rds_ib_recv_init_ring(ic);
        rds_ib_recv_init_ack(ic);
 
+       /* Post receive buffers - as a side effect, this will update
+        * the posted credit count. */
+       rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+
        rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
                 ic->i_send_cq, ic->i_recv_cq);
 
 out:
-       rds_ib_dev_put(rds_ibdev);
        return ret;
 }
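
For reference, the i_send_hdrs and i_recv_hdrs buffers allocated above are
single coherent DMA regions holding one struct rds_header per ring slot, so
the SGE setup later in this patch can address slot i with plain offset
arithmetic. A standalone sketch of that addressing (the header size and DMA
base below are assumptions for illustration, not values from this patch):

    #include <inttypes.h>
    #include <stdio.h>

    struct rds_header { uint8_t bytes[48]; };     /* assumed wire-header size */

    int main(void)
    {
            uint64_t recv_hdrs_dma = 0x100000;    /* assumed coherent base */
            unsigned int slot = 5;                /* ring slot of interest */

            /* mirrors: sge->addr = ic->i_recv_hdrs_dma
             *                    + slot * sizeof(struct rds_header); */
            uint64_t addr = recv_hdrs_dma +
                            (uint64_t)slot * sizeof(struct rds_header);
            printf("slot %u header at 0x%" PRIx64 "\n", slot, addr);
            return 0;
    }
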
 
-static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
+static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp)
 {
-       const struct rds_ib_connect_private *dp = event->param.conn.private_data;
        u16 common;
        u32 version = 0;
 
-       /*
-        * rdma_cm private data is odd - when there is any private data in the
+       /* rdma_cm private data is odd - when there is any private data in the
         * request, we will be given a pretty large buffer without being told the
         * original size. The only way to tell the difference is by looking at
         * the contents, which are initialized to zero.
         * If the protocol version fields aren't set, this is a connection attempt
         * from an older version. This could be 3.0 or 2.0 - we can't tell.
-        * We really should have changed this for OFED 1.3 :-(
-        */
-
-       /* Be paranoid. RDS always has privdata */
-       if (!event->param.conn.private_data_len) {
-               printk(KERN_NOTICE "RDS incoming connection has no private data, "
-                       "rejecting\n");
-               return 0;
-       }
-
-       /* Even if len is crap *now* I still want to check it. -ASG */
-       if (event->param.conn.private_data_len < sizeof (*dp) ||
-           dp->dp_protocol_major == 0)
+        * We really should have changed this for OFED 1.3 :-( */
+       if (dp->dp_protocol_major == 0)
                return RDS_PROTOCOL_3_0;
 
        common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
@@ -443,11 +364,13 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
                version = RDS_PROTOCOL_3_0;
                while ((common >>= 1) != 0)
                        version++;
-       } else
-               printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
-                               &dp->dp_saddr,
-                               dp->dp_protocol_major,
-                               dp->dp_protocol_minor);
+       } else if (printk_ratelimit()) {
+               printk(KERN_NOTICE "RDS: Connection from %pI4 using "
+                       "incompatible protocol version %u.%u\n",
+                       &dp->dp_saddr,
+                       dp->dp_protocol_major,
+                       dp->dp_protocol_minor);
+       }
        return version;
 }
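
The negotiation above treats dp_protocol_minor_mask as a bitmap of supported
3.x minor versions and settles on the highest bit the two peers share, with
bit 0 meaning 3.0. A userspace sketch of the same arithmetic (the macro
encodings are assumed to match rds.h):

    #include <stdint.h>
    #include <stdio.h>

    #define RDS_PROTOCOL(maj, min) (((maj) << 8) | (min))  /* assumed encoding */
    #define RDS_PROTOCOL_3_0       RDS_PROTOCOL(3, 0)

    static uint32_t negotiate(uint16_t theirs, uint16_t ours)
    {
            uint16_t common = theirs & ours;
            uint32_t version = 0;

            if (common) {
                    version = RDS_PROTOCOL_3_0;
                    while ((common >>= 1) != 0)
                            version++;  /* ends at 3.<highest common bit> */
            }
            return version;
    }

    int main(void)
    {
            /* peer advertises minors {0,1}, we advertise {1}: the common
             * bit is bit 1, so this prints 0x0301, i.e. protocol 3.1 */
            printf("0x%04x\n", negotiate(0x0003, 0x0002));
            return 0;
    }
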
 
@@ -462,10 +385,10 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
        struct rds_ib_connection *ic = NULL;
        struct rdma_conn_param conn_param;
        u32 version;
-       int err = 1, destroy = 1;
+       int err, destroy = 1;
 
        /* Check whether the remote protocol version matches ours. */
-       version = rds_ib_protocol_compatible(event);
+       version = rds_ib_protocol_compatible(dp);
        if (!version)
                goto out;
 
@@ -501,6 +424,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
                        /* Wait and see - our connect may still be succeeding */
                        rds_ib_stats_inc(s_ib_connect_raced);
                }
+               mutex_unlock(&conn->c_cm_lock);
                goto out;
        }
 
@@ -530,20 +454,20 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
                goto out;
        }
 
-       rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
-               event->param.conn.responder_resources,
-               event->param.conn.initiator_depth);
+       rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
 
        /* rdma_accept() calls rdma_reject() internally if it fails */
        err = rdma_accept(cm_id, &conn_param);
-       if (err)
+       mutex_unlock(&conn->c_cm_lock);
+       if (err) {
                rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
+               goto out;
+       }
+
+       return 0;
 
 out:
-       if (conn)
-               mutex_unlock(&conn->c_cm_lock);
-       if (err)
-               rdma_reject(cm_id, NULL, 0);
+       rdma_reject(cm_id, NULL, 0);
        return destroy;
 }
 
@@ -567,8 +491,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
                goto out;
        }
 
-       rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
-               UINT_MAX, UINT_MAX);
+       rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
+
        ret = rdma_connect(cm_id, &conn_param);
        if (ret)
                rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -593,7 +517,7 @@ int rds_ib_conn_connect(struct rds_connection *conn)
        /* XXX I wonder what effect the port space has */
        /* delegate cm event handler to rdma_transport */
        ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
-                                    RDMA_PS_TCP, IB_QPT_RC);
+                                    RDMA_PS_TCP);
        if (IS_ERR(ic->i_cm_id)) {
                ret = PTR_ERR(ic->i_cm_id);
                ic->i_cm_id = NULL;
@@ -652,19 +576,9 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
                                ic->i_cm_id, err);
                }
 
-               /*
-                * We want to wait for tx and rx completion to finish
-                * before we tear down the connection, but we have to be
-                * careful not to get stuck waiting on a send ring that
-                * only has unsignaled sends in it.  We've shutdown new
-                * sends before getting here so by waiting for signaled
-                * sends to complete we're ensured that there will be no
-                * more tx processing.
-                */
                wait_event(rds_ib_ring_empty_wait,
-                          rds_ib_ring_empty(&ic->i_recv_ring) &&
-                          (atomic_read(&ic->i_signaled_sends) == 0));
-               tasklet_kill(&ic->i_recv_tasklet);
+                       rds_ib_ring_empty(&ic->i_send_ring) &&
+                       rds_ib_ring_empty(&ic->i_recv_ring));
 
                if (ic->i_send_hdrs)
                        ib_dma_free_coherent(dev,
@@ -715,12 +629,9 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
        BUG_ON(ic->rds_ibdev);
 
        /* Clear pending transmit */
-       if (ic->i_data_op) {
-               struct rds_message *rm;
-
-               rm = container_of(ic->i_data_op, struct rds_message, data);
-               rds_message_put(rm);
-               ic->i_data_op = NULL;
+       if (ic->i_rm) {
+               rds_message_put(ic->i_rm);
+               ic->i_rm = NULL;
        }
 
        /* Clear the ACK state */
@@ -754,27 +665,17 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 {
        struct rds_ib_connection *ic;
        unsigned long flags;
-       int ret;
 
        /* XXX too lazy? */
-       ic = kzalloc(sizeof(struct rds_ib_connection), gfp);
-       if (!ic)
+       ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
+       if (ic == NULL)
                return -ENOMEM;
 
-       ret = rds_ib_recv_alloc_caches(ic);
-       if (ret) {
-               kfree(ic);
-               return ret;
-       }
-
        INIT_LIST_HEAD(&ic->ib_node);
-       tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
-                    (unsigned long) ic);
        mutex_init(&ic->i_recv_mutex);
 #ifndef KERNEL_HAS_ATOMIC64
        spin_lock_init(&ic->i_ack_lock);
 #endif
-       atomic_set(&ic->i_signaled_sends, 0);
 
        /*
         * rds_ib_conn_shutdown() waits for these to be emptied so they
@@ -816,8 +717,6 @@ void rds_ib_conn_free(void *arg)
        list_del(&ic->ib_node);
        spin_unlock_irq(lock_ptr);
 
-       rds_ib_recv_free_caches(ic);
-
        kfree(ic);
 }
 
index 273b8bff6ba448aa013932f5ac7c9f929f49aa70..81033af930207116e5eca3369725a845c8110873 100644 (file)
  *
  */
 #include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/rculist.h>
-#include <linux/llist.h>
 
 #include "rds.h"
+#include "rdma.h"
 #include "ib.h"
 
-static DEFINE_PER_CPU(unsigned long, clean_list_grace);
-#define CLEAN_LIST_BUSY_BIT 0
 
 /*
  * This is stored as mr->r_trans_private.
@@ -48,11 +44,7 @@ struct rds_ib_mr {
        struct rds_ib_device    *device;
        struct rds_ib_mr_pool   *pool;
        struct ib_fmr           *fmr;
-
-       struct llist_node       llnode;
-
-       /* unmap_list is for freeing */
-       struct list_head        unmap_list;
+       struct list_head        list;
        unsigned int            remap_count;
 
        struct scatterlist      *sg;
@@ -66,16 +58,14 @@ struct rds_ib_mr {
  */
 struct rds_ib_mr_pool {
        struct mutex            flush_lock;             /* serialize fmr invalidate */
-       struct delayed_work     flush_worker;           /* flush worker */
+       struct work_struct      flush_worker;           /* flush worker */
 
+       spinlock_t              list_lock;              /* protect variables below */
        atomic_t                item_count;             /* total # of MRs */
        atomic_t                dirty_count;            /* # dirty of MRs */
-
-       struct llist_head       drop_list;              /* MRs that have reached their max_maps limit */
-       struct llist_head       free_list;              /* unused MRs */
-       struct llist_head       clean_list;             /* global unused & unmapped MRs */
-       wait_queue_head_t       flush_wait;
-
+       struct list_head        drop_list;              /* MRs that have reached their max_maps limit */
+       struct list_head        free_list;              /* unused MRs */
+       struct list_head        clean_list;             /* unused & unmapped MRs */
        atomic_t                free_pinned;            /* memory pinned by free MRs */
        unsigned long           max_items;
        unsigned long           max_items_soft;
@@ -83,7 +73,7 @@ struct rds_ib_mr_pool {
        struct ib_fmr_attr      fmr_attr;
 };
 
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all);
 static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
 
@@ -92,17 +82,16 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
        struct rds_ib_device *rds_ibdev;
        struct rds_ib_ipaddr *i_ipaddr;
 
-       rcu_read_lock();
-       list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
-               list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+       list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
+               spin_lock_irq(&rds_ibdev->spinlock);
+               list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
                        if (i_ipaddr->ipaddr == ipaddr) {
-                               atomic_inc(&rds_ibdev->refcount);
-                               rcu_read_unlock();
+                               spin_unlock_irq(&rds_ibdev->spinlock);
                                return rds_ibdev;
                        }
                }
+               spin_unlock_irq(&rds_ibdev->spinlock);
        }
-       rcu_read_unlock();
 
        return NULL;
 }
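
rds_ib_get_device() is the bridge from a bound IP address back to the HCA
that owns it; rds_ib_get_mr() later in this patch resolves devices the same
way. A hypothetical call site (the surrounding error handling is assumed):

    /* hypothetical caller: find the device a socket is bound to before
     * carving an MR out of that device's FMR pool */
    struct rds_ib_device *rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
    if (rds_ibdev == NULL)
            return ERR_PTR(-ENODEV);
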
@@ -118,7 +107,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
        i_ipaddr->ipaddr = ipaddr;
 
        spin_lock_irq(&rds_ibdev->spinlock);
-       list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
+       list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
        spin_unlock_irq(&rds_ibdev->spinlock);
 
        return 0;
@@ -126,24 +115,17 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 
 static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 {
-       struct rds_ib_ipaddr *i_ipaddr;
-       struct rds_ib_ipaddr *to_free = NULL;
-
+       struct rds_ib_ipaddr *i_ipaddr, *next;
 
        spin_lock_irq(&rds_ibdev->spinlock);
-       list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+       list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) {
                if (i_ipaddr->ipaddr == ipaddr) {
-                       list_del_rcu(&i_ipaddr->list);
-                       to_free = i_ipaddr;
+                       list_del(&i_ipaddr->list);
+                       kfree(i_ipaddr);
                        break;
                }
        }
        spin_unlock_irq(&rds_ibdev->spinlock);
-
-       if (to_free) {
-               synchronize_rcu();
-               kfree(to_free);
-       }
 }
 
 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
@@ -151,10 +133,8 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
        struct rds_ib_device *rds_ibdev_old;
 
        rds_ibdev_old = rds_ib_get_device(ipaddr);
-       if (rds_ibdev_old) {
+       if (rds_ibdev_old)
                rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
-               rds_ib_dev_put(rds_ibdev_old);
-       }
 
        return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
 }
@@ -169,13 +149,12 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con
        BUG_ON(list_empty(&ic->ib_node));
        list_del(&ic->ib_node);
 
-       spin_lock(&rds_ibdev->spinlock);
+       spin_lock_irq(&rds_ibdev->spinlock);
        list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
-       spin_unlock(&rds_ibdev->spinlock);
+       spin_unlock_irq(&rds_ibdev->spinlock);
        spin_unlock_irq(&ib_nodev_conns_lock);
 
        ic->rds_ibdev = rds_ibdev;
-       atomic_inc(&rds_ibdev->refcount);
 }
 
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
@@ -195,21 +174,24 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *
        spin_unlock(&ib_nodev_conns_lock);
 
        ic->rds_ibdev = NULL;
-       rds_ib_dev_put(rds_ibdev);
 }
 
-void rds_ib_destroy_nodev_conns(void)
+void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock)
 {
        struct rds_ib_connection *ic, *_ic;
        LIST_HEAD(tmp_list);
 
        /* avoid calling conn_destroy with irqs off */
-       spin_lock_irq(&ib_nodev_conns_lock);
-       list_splice(&ib_nodev_conns, &tmp_list);
-       spin_unlock_irq(&ib_nodev_conns_lock);
-
-       list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
+       spin_lock_irq(list_lock);
+       list_splice(list, &tmp_list);
+       INIT_LIST_HEAD(list);
+       spin_unlock_irq(list_lock);
+
+       list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
+               if (ic->conn->c_passive)
+                       rds_conn_destroy(ic->conn->c_passive);
                rds_conn_destroy(ic->conn);
+       }
 }
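
The splice in __rds_ib_destroy_conns() is the stock idiom for running
teardown that may sleep on entries living on a spinlock-protected list:
claim everything privately under the lock, then work through it unlocked.
A hedged sketch with invented stand-ins (struct item, destroy_item()):

    struct item {
            struct list_head node;
    };

    static void drain_and_destroy(struct list_head *shared, spinlock_t *lock)
    {
            struct item *it, *next;
            LIST_HEAD(tmp);

            spin_lock_irq(lock);
            list_splice_init(shared, &tmp);   /* shared list is left empty */
            spin_unlock_irq(lock);

            /* the entries are private now; destroy_item() may sleep */
            list_for_each_entry_safe(it, next, &tmp, node)
                    destroy_item(it);
    }
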
 
 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
@@ -220,16 +202,16 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
        if (!pool)
                return ERR_PTR(-ENOMEM);
 
-       init_llist_head(&pool->free_list);
-       init_llist_head(&pool->drop_list);
-       init_llist_head(&pool->clean_list);
+       INIT_LIST_HEAD(&pool->free_list);
+       INIT_LIST_HEAD(&pool->drop_list);
+       INIT_LIST_HEAD(&pool->clean_list);
        mutex_init(&pool->flush_lock);
-       init_waitqueue_head(&pool->flush_wait);
-       INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+       spin_lock_init(&pool->list_lock);
+       INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
 
        pool->fmr_attr.max_pages = fmr_message_size;
        pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
-       pool->fmr_attr.page_shift = PAGE_SHIFT;
+       pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift;
        pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
 
        /* We never allow more than max_items MRs to be allocated.
@@ -253,52 +235,34 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
 
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
 {
-       cancel_delayed_work_sync(&pool->flush_worker);
-       rds_ib_flush_mr_pool(pool, 1, NULL);
-       WARN_ON(atomic_read(&pool->item_count));
-       WARN_ON(atomic_read(&pool->free_pinned));
+       flush_workqueue(rds_wq);
+       rds_ib_flush_mr_pool(pool, 1);
+       BUG_ON(atomic_read(&pool->item_count));
+       BUG_ON(atomic_read(&pool->free_pinned));
        kfree(pool);
 }
 
 static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
 {
        struct rds_ib_mr *ibmr = NULL;
-       struct llist_node *ret;
-       unsigned long *flag;
+       unsigned long flags;
 
-       preempt_disable();
-       flag = this_cpu_ptr(&clean_list_grace);
-       set_bit(CLEAN_LIST_BUSY_BIT, flag);
-       ret = llist_del_first(&pool->clean_list);
-       if (ret)
-               ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
+       spin_lock_irqsave(&pool->list_lock, flags);
+       if (!list_empty(&pool->clean_list)) {
+               ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list);
+               list_del_init(&ibmr->list);
+       }
+       spin_unlock_irqrestore(&pool->list_lock, flags);
 
-       clear_bit(CLEAN_LIST_BUSY_BIT, flag);
-       preempt_enable();
        return ibmr;
 }
 
-static inline void wait_clean_list_grace(void)
-{
-       int cpu;
-       unsigned long *flag;
-
-       for_each_online_cpu(cpu) {
-               flag = &per_cpu(clean_list_grace, cpu);
-               while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
-                       cpu_relax();
-       }
-}
-
 static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 {
        struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
        struct rds_ib_mr *ibmr = NULL;
        int err = 0, iter = 0;
 
-       if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
-               schedule_delayed_work(&pool->flush_worker, 10);
-
        while (1) {
                ibmr = rds_ib_reuse_fmr(pool);
                if (ibmr)
@@ -325,24 +289,19 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 
                /* We do have some empty MRs. Flush them out. */
                rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
-               rds_ib_flush_mr_pool(pool, 0, &ibmr);
-               if (ibmr)
-                       return ibmr;
+               rds_ib_flush_mr_pool(pool, 0);
        }
 
-       ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
+       ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
        if (!ibmr) {
                err = -ENOMEM;
                goto out_no_cigar;
        }
 
-       memset(ibmr, 0, sizeof(*ibmr));
-
        ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
                        (IB_ACCESS_LOCAL_WRITE |
                         IB_ACCESS_REMOTE_READ |
-                        IB_ACCESS_REMOTE_WRITE|
-                        IB_ACCESS_REMOTE_ATOMIC),
+                        IB_ACCESS_REMOTE_WRITE),
                        &pool->fmr_attr);
        if (IS_ERR(ibmr->fmr)) {
                err = PTR_ERR(ibmr->fmr);
@@ -390,13 +349,13 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
                unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
                u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
 
-               if (dma_addr & ~PAGE_MASK) {
+               if (dma_addr & ~rds_ibdev->fmr_page_mask) {
                        if (i > 0)
                                return -EINVAL;
                        else
                                ++page_cnt;
                }
-               if ((dma_addr + dma_len) & ~PAGE_MASK) {
+               if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) {
                        if (i < sg_dma_len - 1)
                                return -EINVAL;
                        else
@@ -406,12 +365,11 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
                len += dma_len;
        }
 
-       page_cnt += len >> PAGE_SHIFT;
+       page_cnt += len >> rds_ibdev->fmr_page_shift;
        if (page_cnt > fmr_message_size)
                return -EINVAL;
 
-       dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
-                                rdsibdev_to_node(rds_ibdev));
+       dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC);
        if (!dma_pages)
                return -ENOMEM;
 
@@ -420,9 +378,9 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
                unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
                u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
 
-               for (j = 0; j < dma_len; j += PAGE_SIZE)
+               for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size)
                        dma_pages[page_cnt++] =
-                               (dma_addr & PAGE_MASK) + j;
+                               (dma_addr & rds_ibdev->fmr_page_mask) + j;
        }
 
        ret = ib_map_phys_fmr(ibmr->fmr,
@@ -485,7 +443,6 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
 
                        /* FIXME we need a way to tell a r/w MR
                         * from a r/o MR */
-                       BUG_ON(irqs_disabled());
                        set_page_dirty(page);
                        put_page(page);
                }
@@ -520,108 +477,34 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr
        return 0;
 }
 
-/*
- * given an llist of mrs, put them all into the list_head for more processing
- */
-static void llist_append_to_list(struct llist_head *llist, struct list_head *list)
-{
-       struct rds_ib_mr *ibmr;
-       struct llist_node *node;
-       struct llist_node *next;
-
-       node = llist_del_all(llist);
-       while (node) {
-               next = node->next;
-               ibmr = llist_entry(node, struct rds_ib_mr, llnode);
-               list_add_tail(&ibmr->unmap_list, list);
-               node = next;
-       }
-}
-
-/*
- * this takes a list head of mrs and turns it into linked llist nodes
- * of clusters.  Each cluster has linked llist nodes of
- * MR_CLUSTER_SIZE mrs that are ready for reuse.
- */
-static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
-                               struct list_head *list,
-                               struct llist_node **nodes_head,
-                               struct llist_node **nodes_tail)
-{
-       struct rds_ib_mr *ibmr;
-       struct llist_node *cur = NULL;
-       struct llist_node **next = nodes_head;
-
-       list_for_each_entry(ibmr, list, unmap_list) {
-               cur = &ibmr->llnode;
-               *next = cur;
-               next = &cur->next;
-       }
-       *next = NULL;
-       *nodes_tail = cur;
-}
-
 /*
  * Flush our pool of MRs.
  * At a minimum, all currently unused MRs are unmapped.
  * If the number of MRs allocated exceeds the limit, we also try
  * to free as many MRs as needed to get back to this limit.
  */
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
-                               int free_all, struct rds_ib_mr **ibmr_ret)
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
 {
        struct rds_ib_mr *ibmr, *next;
-       struct llist_node *clean_nodes;
-       struct llist_node *clean_tail;
        LIST_HEAD(unmap_list);
        LIST_HEAD(fmr_list);
        unsigned long unpinned = 0;
+       unsigned long flags;
        unsigned int nfreed = 0, ncleaned = 0, free_goal;
        int ret = 0;
 
        rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
 
-       if (ibmr_ret) {
-               DEFINE_WAIT(wait);
-               while(!mutex_trylock(&pool->flush_lock)) {
-                       ibmr = rds_ib_reuse_fmr(pool);
-                       if (ibmr) {
-                               *ibmr_ret = ibmr;
-                               finish_wait(&pool->flush_wait, &wait);
-                               goto out_nolock;
-                       }
-
-                       prepare_to_wait(&pool->flush_wait, &wait,
-                                       TASK_UNINTERRUPTIBLE);
-                       if (llist_empty(&pool->clean_list))
-                               schedule();
-
-                       ibmr = rds_ib_reuse_fmr(pool);
-                       if (ibmr) {
-                               *ibmr_ret = ibmr;
-                               finish_wait(&pool->flush_wait, &wait);
-                               goto out_nolock;
-                       }
-               }
-               finish_wait(&pool->flush_wait, &wait);
-       } else
-               mutex_lock(&pool->flush_lock);
-
-       if (ibmr_ret) {
-               ibmr = rds_ib_reuse_fmr(pool);
-               if (ibmr) {
-                       *ibmr_ret = ibmr;
-                       goto out;
-               }
-       }
+       mutex_lock(&pool->flush_lock);
 
+       spin_lock_irqsave(&pool->list_lock, flags);
        /* Get the list of all MRs to be dropped. Ordering matters -
-        * we want to put drop_list ahead of free_list.
-        */
-       llist_append_to_list(&pool->drop_list, &unmap_list);
-       llist_append_to_list(&pool->free_list, &unmap_list);
+        * we want to put drop_list ahead of free_list. */
+       list_splice_init(&pool->free_list, &unmap_list);
+       list_splice_init(&pool->drop_list, &unmap_list);
        if (free_all)
-               llist_append_to_list(&pool->clean_list, &unmap_list);
+               list_splice_init(&pool->clean_list, &unmap_list);
+       spin_unlock_irqrestore(&pool->list_lock, flags);
 
        free_goal = rds_ib_flush_goal(pool, free_all);
 
@@ -629,20 +512,19 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
                goto out;
 
        /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
-       list_for_each_entry(ibmr, &unmap_list, unmap_list)
+       list_for_each_entry(ibmr, &unmap_list, list)
                list_add(&ibmr->fmr->list, &fmr_list);
-
        ret = ib_unmap_fmr(&fmr_list);
        if (ret)
                printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
 
        /* Now we can destroy the DMA mapping and unpin any pages */
-       list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
+       list_for_each_entry_safe(ibmr, next, &unmap_list, list) {
                unpinned += ibmr->sg_len;
                __rds_ib_teardown_mr(ibmr);
                if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
                        rds_ib_stats_inc(s_ib_rdma_mr_free);
-                       list_del(&ibmr->unmap_list);
+                       list_del(&ibmr->list);
                        ib_dealloc_fmr(ibmr->fmr);
                        kfree(ibmr);
                        nfreed++;
@@ -650,27 +532,9 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
                ncleaned++;
        }
 
-       if (!list_empty(&unmap_list)) {
-               /* we have to make sure that none of the things we're about
-                * to put on the clean list would race with other cpus trying
-                * to pull items off.  The llist would explode if we managed to
-                * remove something from the clean list and then add it back again
-                * while another CPU was spinning on that same item in llist_del_first.
-                *
-                * This is pretty unlikely, but just in case, wait for an llist grace period
-                * here before adding anything back into the clean list.
-                */
-               wait_clean_list_grace();
-
-               list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
-               if (ibmr_ret)
-                       *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
-
-               /* more than one entry in llist nodes */
-               if (clean_nodes->next)
-                       llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list);
-
-       }
+       spin_lock_irqsave(&pool->list_lock, flags);
+       list_splice(&unmap_list, &pool->clean_list);
+       spin_unlock_irqrestore(&pool->list_lock, flags);
 
        atomic_sub(unpinned, &pool->free_pinned);
        atomic_sub(ncleaned, &pool->dirty_count);
@@ -678,17 +542,14 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 
 out:
        mutex_unlock(&pool->flush_lock);
-       if (waitqueue_active(&pool->flush_wait))
-               wake_up(&pool->flush_wait);
-out_nolock:
        return ret;
 }
 
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
 {
-       struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
+       struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
 
-       rds_ib_flush_mr_pool(pool, 0, NULL);
+       rds_ib_flush_mr_pool(pool, 0);
 }
 
 void rds_ib_free_mr(void *trans_private, int invalidate)
@@ -696,48 +557,47 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
        struct rds_ib_mr *ibmr = trans_private;
        struct rds_ib_device *rds_ibdev = ibmr->device;
        struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+       unsigned long flags;
 
        rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
 
        /* Return it to the pool's free list */
+       spin_lock_irqsave(&pool->list_lock, flags);
        if (ibmr->remap_count >= pool->fmr_attr.max_maps)
-               llist_add(&ibmr->llnode, &pool->drop_list);
+               list_add(&ibmr->list, &pool->drop_list);
        else
-               llist_add(&ibmr->llnode, &pool->free_list);
+               list_add(&ibmr->list, &pool->free_list);
 
        atomic_add(ibmr->sg_len, &pool->free_pinned);
        atomic_inc(&pool->dirty_count);
+       spin_unlock_irqrestore(&pool->list_lock, flags);
 
        /* If we've pinned too many pages, request a flush */
-       if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
-           atomic_read(&pool->dirty_count) >= pool->max_items / 10)
-               schedule_delayed_work(&pool->flush_worker, 10);
+       if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
+        || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+               queue_work(rds_wq, &pool->flush_worker);
 
        if (invalidate) {
                if (likely(!in_interrupt())) {
-                       rds_ib_flush_mr_pool(pool, 0, NULL);
+                       rds_ib_flush_mr_pool(pool, 0);
                } else {
                        /* We get here if the user created a MR marked
                         * as use_once and invalidate at the same time. */
-                       schedule_delayed_work(&pool->flush_worker, 10);
+                       queue_work(rds_wq, &pool->flush_worker);
                }
        }
-
-       rds_ib_dev_put(rds_ibdev);
 }
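
The flush trigger above fires on either of two watermarks: pinned pages held
by free MRs, or dirty MRs reaching a tenth of the pool. Rough arithmetic
under assumed (not actual) defaults gives a feel for the scale:

    /* assumed illustrative values, not the driver's real defaults */
    unsigned long fmr_message_size = 257;   /* pages mappable per MR */
    unsigned long max_fmrs = 2048;          /* MRs per device */

    /* same formula as rds_ib_create_mr_pool() above */
    unsigned long max_free_pinned = max_fmrs * fmr_message_size / 4;
    /* = 131584 pages, ~514 MB of 4 KB pages parked on the free and drop
     * lists before rds_ib_free_mr() queues the flush worker */
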
 
 void rds_ib_flush_mrs(void)
 {
        struct rds_ib_device *rds_ibdev;
 
-       down_read(&rds_ib_devices_lock);
        list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
                struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
 
                if (pool)
-                       rds_ib_flush_mr_pool(pool, 0, NULL);
+                       rds_ib_flush_mr_pool(pool, 0);
        }
-       up_read(&rds_ib_devices_lock);
 }
 
 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
@@ -769,7 +629,6 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
                printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
 
        ibmr->device = rds_ibdev;
-       rds_ibdev = NULL;
 
  out:
        if (ret) {
@@ -777,8 +636,5 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
                        rds_ib_free_mr(ibmr, 0);
                ibmr = ERR_PTR(ret);
        }
-       if (rds_ibdev)
-               rds_ib_dev_put(rds_ibdev);
        return ibmr;
 }
-
index 1b981a4e42c214d575a838b096da368a7f0316c6..36d931573ff4f4cb52aec684564e142bbea388e9 100644 (file)
@@ -31,7 +31,6 @@
  *
  */
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <rdma/rdma_cm.h>
@@ -43,6 +42,42 @@ static struct kmem_cache *rds_ib_incoming_slab;
 static struct kmem_cache *rds_ib_frag_slab;
 static atomic_t        rds_ib_allocation = ATOMIC_INIT(0);
 
+static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
+{
+       rdsdebug("frag %p page %p\n", frag, frag->f_page);
+       __free_page(frag->f_page);
+       frag->f_page = NULL;
+}
+
+static void rds_ib_frag_free(struct rds_page_frag *frag)
+{
+       rdsdebug("frag %p page %p\n", frag, frag->f_page);
+       BUG_ON(frag->f_page != NULL);
+       kmem_cache_free(rds_ib_frag_slab, frag);
+}
+
+/*
+ * We map a page at a time.  Its fragments are posted in order.  This
+ * is called in fragment order as the fragments get receive completion events.
+ * Only the last frag in the page performs the unmapping.
+ *
+ * It's OK for ring cleanup to call this in whatever order it likes because
+ * DMA is not in flight and so we can unmap while other ring entries still
+ * hold page references in their frags.
+ */
+static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
+                                  struct rds_ib_recv_work *recv)
+{
+       struct rds_page_frag *frag = recv->r_frag;
+
+       rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
+       if (frag->f_mapped)
+               ib_dma_unmap_page(ic->i_cm_id->device,
+                              frag->f_mapped,
+                              RDS_FRAG_SIZE, DMA_FROM_DEVICE);
+       frag->f_mapped = 0;
+}
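
The unmap-on-last-fragment rule above comes from how receive pages are
carved: a page is handed out in RDS_FRAG_SIZE chunks and RDS_PAGE_LAST_OFF
names the offset of the final chunk. A sketch of the constants, assumed to
mirror their ib.h definitions:

    /* assumed to mirror net/rds/ib.h */
    #define RDS_FRAG_SHIFT    12
    #define RDS_FRAG_SIZE     ((unsigned int)(1 << RDS_FRAG_SHIFT))
    #define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)

    /* with 4 KB pages there is one fragment per page, so every completion
     * unmaps; with 64 KB pages sixteen recvs share a single DMA mapping
     * and only the fragment at RDS_PAGE_LAST_OFF performs the unmap */
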
+
 void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 {
        struct rds_ib_recv_work *recv;
@@ -59,161 +94,16 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
                recv->r_wr.sg_list = recv->r_sge;
                recv->r_wr.num_sge = RDS_IB_RECV_SGE;
 
-               sge = &recv->r_sge[0];
-               sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
-               sge->length = sizeof(struct rds_header);
-               sge->lkey = ic->i_mr->lkey;
-
-               sge = &recv->r_sge[1];
+               sge = rds_ib_data_sge(ic, recv->r_sge);
                sge->addr = 0;
                sge->length = RDS_FRAG_SIZE;
                sge->lkey = ic->i_mr->lkey;
-       }
-}
-
-/*
- * The entire 'from' list, including the from element itself, is put on
- * to the tail of the 'to' list.
- */
-static void list_splice_entire_tail(struct list_head *from,
-                                   struct list_head *to)
-{
-       struct list_head *from_last = from->prev;
-
-       list_splice_tail(from_last, to);
-       list_add_tail(from_last, to);
-}
-
-static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
-{
-       struct list_head *tmp;
-
-       tmp = xchg(&cache->xfer, NULL);
-       if (tmp) {
-               if (cache->ready)
-                       list_splice_entire_tail(tmp, cache->ready);
-               else
-                       cache->ready = tmp;
-       }
-}
-
-static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
-{
-       struct rds_ib_cache_head *head;
-       int cpu;
-
-       cache->percpu = alloc_percpu(struct rds_ib_cache_head);
-       if (!cache->percpu)
-              return -ENOMEM;
-
-       for_each_possible_cpu(cpu) {
-               head = per_cpu_ptr(cache->percpu, cpu);
-               head->first = NULL;
-               head->count = 0;
-       }
-       cache->xfer = NULL;
-       cache->ready = NULL;
-
-       return 0;
-}
-
-int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
-{
-       int ret;
-
-       ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
-       if (!ret) {
-               ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
-               if (ret)
-                       free_percpu(ic->i_cache_incs.percpu);
-       }
-
-       return ret;
-}
 
-static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
-                                         struct list_head *caller_list)
-{
-       struct rds_ib_cache_head *head;
-       int cpu;
-
-       for_each_possible_cpu(cpu) {
-               head = per_cpu_ptr(cache->percpu, cpu);
-               if (head->first) {
-                       list_splice_entire_tail(head->first, caller_list);
-                       head->first = NULL;
-               }
-       }
-
-       if (cache->ready) {
-               list_splice_entire_tail(cache->ready, caller_list);
-               cache->ready = NULL;
-       }
-}
-
-void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
-{
-       struct rds_ib_incoming *inc;
-       struct rds_ib_incoming *inc_tmp;
-       struct rds_page_frag *frag;
-       struct rds_page_frag *frag_tmp;
-       LIST_HEAD(list);
-
-       rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
-       rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
-       free_percpu(ic->i_cache_incs.percpu);
-
-       list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
-               list_del(&inc->ii_cache_entry);
-               WARN_ON(!list_empty(&inc->ii_frags));
-               kmem_cache_free(rds_ib_incoming_slab, inc);
-       }
-
-       rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
-       rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
-       free_percpu(ic->i_cache_frags.percpu);
-
-       list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
-               list_del(&frag->f_cache_entry);
-               WARN_ON(!list_empty(&frag->f_item));
-               kmem_cache_free(rds_ib_frag_slab, frag);
-       }
-}
-
-/* fwd decl */
-static void rds_ib_recv_cache_put(struct list_head *new_item,
-                                 struct rds_ib_refill_cache *cache);
-static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
-
-
-/* Recycle frag and attached recv buffer f_sg */
-static void rds_ib_frag_free(struct rds_ib_connection *ic,
-                            struct rds_page_frag *frag)
-{
-       rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
-
-       rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
-}
-
-/* Recycle inc after freeing attached frags */
-void rds_ib_inc_free(struct rds_incoming *inc)
-{
-       struct rds_ib_incoming *ibinc;
-       struct rds_page_frag *frag;
-       struct rds_page_frag *pos;
-       struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
-
-       ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
-
-       /* Free attached frags */
-       list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
-               list_del_init(&frag->f_item);
-               rds_ib_frag_free(ic, frag);
+               sge = rds_ib_header_sge(ic, recv->r_sge);
+               sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+               sge->length = sizeof(struct rds_header);
+               sge->lkey = ic->i_mr->lkey;
        }
-       BUG_ON(!list_empty(&ibinc->ii_frags));
-
-       rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
-       rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
 }
 
 static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
@@ -224,8 +114,10 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
                recv->r_ibinc = NULL;
        }
        if (recv->r_frag) {
-               ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
-               rds_ib_frag_free(ic, recv->r_frag);
+               rds_ib_recv_unmap_page(ic, recv);
+               if (recv->r_frag->f_page)
+                       rds_ib_frag_drop_page(recv->r_frag);
+               rds_ib_frag_free(recv->r_frag);
                recv->r_frag = NULL;
        }
 }
@@ -236,111 +128,83 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
 
        for (i = 0; i < ic->i_recv_ring.w_nr; i++)
                rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
-}
 
-static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
-                                                    gfp_t slab_mask)
-{
-       struct rds_ib_incoming *ibinc;
-       struct list_head *cache_item;
-       int avail_allocs;
-
-       cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
-       if (cache_item) {
-               ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
-       } else {
-               avail_allocs = atomic_add_unless(&rds_ib_allocation,
-                                                1, rds_ib_sysctl_max_recv_allocation);
-               if (!avail_allocs) {
-                       rds_ib_stats_inc(s_ib_rx_alloc_limit);
-                       return NULL;
-               }
-               ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
-               if (!ibinc) {
-                       atomic_dec(&rds_ib_allocation);
-                       return NULL;
-               }
-       }
-       INIT_LIST_HEAD(&ibinc->ii_frags);
-       rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
-
-       return ibinc;
-}
-
-static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
-                                                   gfp_t slab_mask, gfp_t page_mask)
-{
-       struct rds_page_frag *frag;
-       struct list_head *cache_item;
-       int ret;
-
-       cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
-       if (cache_item) {
-               frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
-       } else {
-               frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
-               if (!frag)
-                       return NULL;
-
-               sg_init_table(&frag->f_sg, 1);
-               ret = rds_page_remainder_alloc(&frag->f_sg,
-                                              RDS_FRAG_SIZE, page_mask);
-               if (ret) {
-                       kmem_cache_free(rds_ib_frag_slab, frag);
-                       return NULL;
-               }
-       }
-
-       INIT_LIST_HEAD(&frag->f_item);
-
-       return frag;
+       if (ic->i_frag.f_page)
+               rds_ib_frag_drop_page(&ic->i_frag);
 }
 
 static int rds_ib_recv_refill_one(struct rds_connection *conn,
-                                 struct rds_ib_recv_work *recv, int prefill)
+                                 struct rds_ib_recv_work *recv,
+                                 gfp_t kptr_gfp, gfp_t page_gfp)
 {
        struct rds_ib_connection *ic = conn->c_transport_data;
+       dma_addr_t dma_addr;
        struct ib_sge *sge;
        int ret = -ENOMEM;
-       gfp_t slab_mask = GFP_NOWAIT;
-       gfp_t page_mask = GFP_NOWAIT;
 
-       if (prefill) {
-               slab_mask = GFP_KERNEL;
-               page_mask = GFP_HIGHUSER;
+       if (recv->r_ibinc == NULL) {
+               if (atomic_read(&rds_ib_allocation) >= rds_ib_sysctl_max_recv_allocation) {
+                       rds_ib_stats_inc(s_ib_rx_alloc_limit);
+                       goto out;
+               }
+               recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
+                                                kptr_gfp);
+               if (recv->r_ibinc == NULL)
+                       goto out;
+               atomic_inc(&rds_ib_allocation);
+               INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
+               rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
        }
 
-       if (!ic->i_cache_incs.ready)
-               rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
-       if (!ic->i_cache_frags.ready)
-               rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+       if (recv->r_frag == NULL) {
+               recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
+               if (recv->r_frag == NULL)
+                       goto out;
+               INIT_LIST_HEAD(&recv->r_frag->f_item);
+               recv->r_frag->f_page = NULL;
+       }
 
-       /*
-        * ibinc was taken from recv if recv contained the start of a message.
-        * recvs that were continuations will still have this allocated.
-        */
-       if (!recv->r_ibinc) {
-               recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
-               if (!recv->r_ibinc)
+       if (ic->i_frag.f_page == NULL) {
+               ic->i_frag.f_page = alloc_page(page_gfp);
+               if (ic->i_frag.f_page == NULL)
                        goto out;
+               ic->i_frag.f_offset = 0;
        }
 
-       WARN_ON(recv->r_frag); /* leak! */
-       recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
-       if (!recv->r_frag)
+       dma_addr = ib_dma_map_page(ic->i_cm_id->device,
+                                 ic->i_frag.f_page,
+                                 ic->i_frag.f_offset,
+                                 RDS_FRAG_SIZE,
+                                 DMA_FROM_DEVICE);
+       if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
                goto out;
 
-       ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
-                           1, DMA_FROM_DEVICE);
-       WARN_ON(ret != 1);
+       /*
+        * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_recv_unmap_page()
+        * must be called on this recv.  This happens as completions hit
+        * in order or on connection shutdown.
+        */
+       recv->r_frag->f_page = ic->i_frag.f_page;
+       recv->r_frag->f_offset = ic->i_frag.f_offset;
+       recv->r_frag->f_mapped = dma_addr;
 
-       sge = &recv->r_sge[0];
+       sge = rds_ib_data_sge(ic, recv->r_sge);
+       sge->addr = dma_addr;
+       sge->length = RDS_FRAG_SIZE;
+
+       sge = rds_ib_header_sge(ic, recv->r_sge);
        sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
        sge->length = sizeof(struct rds_header);
 
-       sge = &recv->r_sge[1];
-       sge->addr = ib_sg_dma_address(ic->i_cm_id->device, &recv->r_frag->f_sg);
-       sge->length = ib_sg_dma_len(ic->i_cm_id->device, &recv->r_frag->f_sg);
+       get_page(recv->r_frag->f_page);
+
+       if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
+               ic->i_frag.f_offset += RDS_FRAG_SIZE;
+       } else {
+               put_page(ic->i_frag.f_page);
+               ic->i_frag.f_page = NULL;
+               ic->i_frag.f_offset = 0;
+       }
 
        ret = 0;
 out:
@@ -350,11 +214,13 @@ out:
 /*
  * This tries to allocate and post unused work requests after making sure that
  * they have all the allocations they need to queue received fragments into
- * sockets.
+ * sockets.  The i_recv_mutex is held here so that ring_alloc and _unalloc
+ * pairs don't go unmatched.
  *
  * -1 is returned if posting fails due to temporary resource exhaustion.
  */
-void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
+int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+                      gfp_t page_gfp, int prefill)
 {
        struct rds_ib_connection *ic = conn->c_transport_data;
        struct rds_ib_recv_work *recv;
@@ -363,33 +229,33 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
        int ret = 0;
        u32 pos;
 
-       while ((prefill || rds_conn_up(conn)) &&
-              rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+       while ((prefill || rds_conn_up(conn))
+                       && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
                if (pos >= ic->i_recv_ring.w_nr) {
                        printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
                                        pos);
+                       ret = -EINVAL;
                        break;
                }
 
                recv = &ic->i_recvs[pos];
-               ret = rds_ib_recv_refill_one(conn, recv, prefill);
+               ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
                if (ret) {
+                       ret = -1;
                        break;
                }
 
                /* XXX when can this fail? */
                ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
                rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
-                        recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
-                        (long) ib_sg_dma_address(
-                               ic->i_cm_id->device,
-                               &recv->r_frag->f_sg),
-                       ret);
+                        recv->r_ibinc, recv->r_frag->f_page,
+                        (long) recv->r_frag->f_mapped, ret);
                if (ret) {
                        rds_ib_conn_error(conn, "recv post on "
                               "%pI4 returned %d, disconnecting and "
                               "reconnecting\n", &conn->c_faddr,
                               ret);
+                       ret = -1;
                        break;
                }
 
@@ -402,82 +268,48 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 
        if (ret)
                rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
+       return ret;
 }
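
Callers are expected to treat the -1 return as transient back-pressure: a
completion path retries with non-blocking allocations and defers to process
context when it cannot make progress. A hypothetical call site (the recv
worker field name is assumed, not taken from this patch):

    /* hypothetical softirq-side refill: no sleeping allocations, fall
     * back to the connection's recv worker when the refill stalls */
    if (rds_ib_recv_refill(conn, GFP_NOWAIT, GFP_NOWAIT, 0))
            queue_delayed_work(rds_wq, &conn->c_recv_w, 1);
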
 
-/*
- * We want to recycle several types of recv allocations, like incs and frags.
- * To use this, the *_free() function passes in the ptr to a list_head within
- * the recyclee, as well as the cache to put it on.
- *
- * First, we put the memory on a percpu list. When this reaches a certain size,
- * we move it to an intermediate non-percpu list in a lockless manner, with some
- * xchg/cmpxchg wizardry.
- *
- * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
- * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
- * list_empty() will return true even when one element is actually present.
- */
-static void rds_ib_recv_cache_put(struct list_head *new_item,
-                                struct rds_ib_refill_cache *cache)
+void rds_ib_inc_purge(struct rds_incoming *inc)
 {
-       unsigned long flags;
-       struct list_head *old, *chpfirst;
-
-       local_irq_save(flags);
-
-       chpfirst = __this_cpu_read(cache->percpu->first);
-       if (!chpfirst)
-               INIT_LIST_HEAD(new_item);
-       else /* put on front */
-               list_add_tail(new_item, chpfirst);
-
-       __this_cpu_write(cache->percpu->first, new_item);
-       __this_cpu_inc(cache->percpu->count);
+       struct rds_ib_incoming *ibinc;
+       struct rds_page_frag *frag;
+       struct rds_page_frag *pos;
 
-       if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT)
-               goto end;
+       ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+       rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
 
-       /*
-        * Return our per-cpu first list to the cache's xfer by atomically
-        * grabbing the current xfer list, appending it to our per-cpu list,
-        * and then atomically returning that entire list back to the
-        * cache's xfer list as long as it's still empty.
-        */
-       do {
-               old = xchg(&cache->xfer, NULL);
-               if (old)
-                       list_splice_entire_tail(old, chpfirst);
-               old = cmpxchg(&cache->xfer, NULL, chpfirst);
-       } while (old);
-
-
-       __this_cpu_write(cache->percpu->first, NULL);
-       __this_cpu_write(cache->percpu->count, 0);
-end:
-       local_irq_restore(flags);
+       list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+               list_del_init(&frag->f_item);
+               rds_ib_frag_drop_page(frag);
+               rds_ib_frag_free(frag);
+       }
 }
 
-static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
+void rds_ib_inc_free(struct rds_incoming *inc)
 {
-       struct list_head *head = cache->ready;
-
-       if (head) {
-               if (!list_empty(head)) {
-                       cache->ready = head->next;
-                       list_del_init(head);
-               } else
-                       cache->ready = NULL;
-       }
+       struct rds_ib_incoming *ibinc;
+
+       ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
 
-       return head;
+       rds_ib_inc_purge(inc);
+       rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+       BUG_ON(!list_empty(&ibinc->ii_frags));
+       kmem_cache_free(rds_ib_incoming_slab, ibinc);
+       atomic_dec(&rds_ib_allocation);
+       BUG_ON(atomic_read(&rds_ib_allocation) < 0);
 }
 
-int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+                           size_t size)
 {
        struct rds_ib_incoming *ibinc;
        struct rds_page_frag *frag;
+       struct iovec *iov = first_iov;
        unsigned long to_copy;
        unsigned long frag_off = 0;
+       unsigned long iov_off = 0;
        int copied = 0;
        int ret;
        u32 len;
@@ -486,25 +318,37 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
        frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
        len = be32_to_cpu(inc->i_hdr.h_len);
 
-       while (iov_iter_count(to) && copied < len) {
+       while (copied < size && copied < len) {
                if (frag_off == RDS_FRAG_SIZE) {
                        frag = list_entry(frag->f_item.next,
                                          struct rds_page_frag, f_item);
                        frag_off = 0;
                }
-               to_copy = min_t(unsigned long, iov_iter_count(to),
-                               RDS_FRAG_SIZE - frag_off);
+               while (iov_off == iov->iov_len) {
+                       iov_off = 0;
+                       iov++;
+               }
+
+               to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
+               to_copy = min_t(size_t, to_copy, size - copied);
                to_copy = min_t(unsigned long, to_copy, len - copied);
 
+               rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
+                        "[%p, %lu] + %lu\n",
+                        to_copy, iov->iov_base, iov->iov_len, iov_off,
+                        frag->f_page, frag->f_offset, frag_off);
+
                /* XXX needs + offset for multiple recvs per page */
-               rds_stats_add(s_copy_to_user, to_copy);
-               ret = copy_page_to_iter(sg_page(&frag->f_sg),
-                                       frag->f_sg.offset + frag_off,
-                                       to_copy,
-                                       to);
-               if (ret != to_copy)
-                       return -EFAULT;
+               ret = rds_page_copy_to_user(frag->f_page,
+                                           frag->f_offset + frag_off,
+                                           iov->iov_base + iov_off,
+                                           to_copy);
+               if (ret) {
+                       copied = ret;
+                       break;
+               }
 
+               iov_off += to_copy;
                frag_off += to_copy;
                copied += to_copy;
        }
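
The restored loop advances through two sequences at once, RDS_FRAG_SIZE receive fragments and variable-length user iovecs, and on each pass copies the largest run that fits all three limits: room left in the current iovec, data left in the current fragment, and bytes left in the message. A standalone sketch of that sizing logic (the buffer sizes are made up for illustration):

    #include <stdio.h>

    #define FRAG_SIZE 4096UL

    static unsigned long min3(unsigned long a, unsigned long b, unsigned long c)
    {
            unsigned long m = a < b ? a : b;
            return m < c ? m : c;
    }

    int main(void)
    {
            unsigned long iov_len[] = { 1000, 5000, 3000 }; /* user buffers */
            unsigned long msg_len = 9000;                   /* h_len */
            unsigned long iov = 0, iov_off = 0, frag_off = 0, copied = 0;

            while (copied < msg_len) {
                    if (frag_off == FRAG_SIZE)          /* next fragment */
                            frag_off = 0;
                    while (iov_off == iov_len[iov]) {   /* next iovec */
                            iov_off = 0;
                            iov++;
                    }
                    unsigned long n = min3(iov_len[iov] - iov_off,
                                           FRAG_SIZE - frag_off,
                                           msg_len - copied);
                    printf("copy %lu: frag+%lu to iov%lu+%lu\n",
                           n, frag_off, iov, iov_off);
                    iov_off += n;
                    frag_off += n;
                    copied += n;
            }
            return 0;
    }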
@@ -583,7 +427,7 @@ static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
 {
        atomic64_set(&ic->i_ack_next, seq);
        if (ack_required) {
-               smp_mb__before_atomic();
+               smp_mb__before_clear_bit();
                set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
        }
 }
@@ -591,7 +435,7 @@ static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
 static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
 {
        clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-       smp_mb__after_atomic();
+       smp_mb__after_clear_bit();
 
        return atomic64_read(&ic->i_ack_next);
 }
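
The two hunks above only rename barriers (smp_mb__before/after_atomic() back to the older smp_mb__before/after_clear_bit()); the pairing they implement is unchanged: i_ack_next must be globally visible before IB_ACK_REQUESTED is set, and the flag must be cleared before i_ack_next is re-read, so the reader can never consume the flag yet miss the sequence number published with it. A userspace sketch of the same ordering, with C11 fences standing in for the kernel's full barriers:

    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic uint64_t ack_next;
    static atomic_flag      ack_requested = ATOMIC_FLAG_INIT;

    /* Producer: publish the sequence number, then raise the flag. */
    static void set_ack(uint64_t seq)
    {
            atomic_store_explicit(&ack_next, seq, memory_order_relaxed);
            atomic_thread_fence(memory_order_seq_cst); /* ~smp_mb__before_clear_bit() */
            atomic_flag_test_and_set_explicit(&ack_requested,
                                              memory_order_relaxed);
    }

    /* Consumer: clear the flag, then read the sequence number. */
    static uint64_t get_ack(void)
    {
            atomic_flag_clear_explicit(&ack_requested, memory_order_relaxed);
            atomic_thread_fence(memory_order_seq_cst); /* ~smp_mb__after_clear_bit() */
            return atomic_load_explicit(&ack_next, memory_order_relaxed);
    }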
@@ -623,8 +467,8 @@ static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credi
                set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
 
                rds_ib_stats_inc(s_ib_ack_send_failure);
-
-               rds_ib_conn_error(ic->conn, "sending ack failed\n");
+               /* Need to finesse this later. */
+               BUG();
        } else
                rds_ib_stats_inc(s_ib_ack_sent);
 }
@@ -680,7 +524,7 @@ void rds_ib_attempt_ack(struct rds_ib_connection *ic)
        }
 
        /* Can we get a send credit? */
-       if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
+       if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
                rds_ib_stats_inc(s_ib_tx_throttle);
                clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
                return;
@@ -752,7 +596,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
                to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
                BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
 
-               addr = kmap_atomic(sg_page(&frag->f_sg));
+               addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
 
                src = addr + frag_off;
                dst = (void *)map->m_page_addrs[map_page] + map_off;
@@ -762,7 +606,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
                        uncongested |= ~(*src) & *dst;
                        *dst++ = *src++;
                }
-               kunmap_atomic(addr);
+               kunmap_atomic(addr, KM_SOFTIRQ0);
 
                copied += to_copy;
 
@@ -801,7 +645,7 @@ struct rds_ib_ack_state {
 };
 
 static void rds_ib_process_recv(struct rds_connection *conn,
-                               struct rds_ib_recv_work *recv, u32 data_len,
+                               struct rds_ib_recv_work *recv, u32 byte_len,
                                struct rds_ib_ack_state *state)
 {
        struct rds_ib_connection *ic = conn->c_transport_data;
@@ -811,17 +655,17 @@ static void rds_ib_process_recv(struct rds_connection *conn,
        /* XXX shut down the connection if port 0,0 are seen? */
 
        rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
-                data_len);
+                byte_len);
 
-       if (data_len < sizeof(struct rds_header)) {
+       if (byte_len < sizeof(struct rds_header)) {
                rds_ib_conn_error(conn, "incoming message "
-                      "from %pI4 didn't include a "
+                      "from %pI4 didn't inclue a "
                       "header, disconnecting and "
                       "reconnecting\n",
                       &conn->c_faddr);
                return;
        }
-       data_len -= sizeof(struct rds_header);
+       byte_len -= sizeof(struct rds_header);
 
        ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
 
@@ -843,7 +687,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
        if (ihdr->h_credit)
                rds_ib_send_add_credits(conn, ihdr->h_credit);
 
-       if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
+       if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
                /* This is an ACK-only packet. It gets special
                 * treatment here because, historically, ACKs
                 * were rather special beasts.
@@ -855,12 +699,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
                 * the inc is freed.  We don't go that route, so we have to drop the
                 * page ref ourselves.  We can't just leave the page on the recv
                 * because that confuses the dma mapping of pages and each recv's use
-                * of a partial page.
+                * of a partial page.  We can leave the frag, though, it will be
+                * reused.
                 *
                 * FIXME: Fold this into the code path below.
                 */
-               rds_ib_frag_free(ic, recv->r_frag);
-               recv->r_frag = NULL;
+               rds_ib_frag_drop_page(recv->r_frag);
                return;
        }
 
@@ -870,7 +714,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
         * into the inc and save the inc so we can hang upcoming fragments
         * off its list.
         */
-       if (!ibinc) {
+       if (ibinc == NULL) {
                ibinc = recv->r_ibinc;
                recv->r_ibinc = NULL;
                ic->i_ibinc = ibinc;
@@ -885,10 +729,10 @@ static void rds_ib_process_recv(struct rds_connection *conn,
                hdr = &ibinc->ii_inc.i_hdr;
                /* We can't just use memcmp here; fragments of a
                 * single message may carry different ACKs */
-               if (hdr->h_sequence != ihdr->h_sequence ||
-                   hdr->h_len != ihdr->h_len ||
-                   hdr->h_sport != ihdr->h_sport ||
-                   hdr->h_dport != ihdr->h_dport) {
+               if (hdr->h_sequence != ihdr->h_sequence
+                || hdr->h_len != ihdr->h_len
+                || hdr->h_sport != ihdr->h_sport
+                || hdr->h_dport != ihdr->h_dport) {
                        rds_ib_conn_error(conn,
                                "fragment header mismatch; forcing reconnect\n");
                        return;
@@ -908,7 +752,8 @@ static void rds_ib_process_recv(struct rds_connection *conn,
                        rds_ib_cong_recv(conn, ibinc);
                else {
                        rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
-                                         &ibinc->ii_inc, GFP_ATOMIC);
+                                         &ibinc->ii_inc, GFP_ATOMIC,
+                                         KM_SOFTIRQ0);
                        state->ack_next = be64_to_cpu(hdr->h_sequence);
                        state->ack_next_valid = 1;
                }
@@ -938,67 +783,45 @@ void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
 {
        struct rds_connection *conn = context;
        struct rds_ib_connection *ic = conn->c_transport_data;
+       struct ib_wc wc;
+       struct rds_ib_ack_state state = { 0, };
+       struct rds_ib_recv_work *recv;
 
        rdsdebug("conn %p cq %p\n", conn, cq);
 
        rds_ib_stats_inc(s_ib_rx_cq_call);
 
-       tasklet_schedule(&ic->i_recv_tasklet);
-}
+       ib_req_notify_cq(cq, IB_CQ_SOLICITED);
 
-static inline void rds_poll_cq(struct rds_ib_connection *ic,
-                              struct rds_ib_ack_state *state)
-{
-       struct rds_connection *conn = ic->conn;
-       struct ib_wc wc;
-       struct rds_ib_recv_work *recv;
-
-       while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
-               rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
-                        (unsigned long long)wc.wr_id, wc.status,
-                        rds_ib_wc_status_str(wc.status), wc.byte_len,
+       while (ib_poll_cq(cq, 1, &wc) > 0) {
+               rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+                        (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
                         be32_to_cpu(wc.ex.imm_data));
                rds_ib_stats_inc(s_ib_rx_cq_event);
 
                recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
 
-               ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+               rds_ib_recv_unmap_page(ic, recv);
 
                /*
                 * Also process recvs in connecting state because it is possible
                 * to get a recv completion _before_ the rdmacm ESTABLISHED
                 * event is processed.
                 */
-               if (wc.status == IB_WC_SUCCESS) {
-                       rds_ib_process_recv(conn, recv, wc.byte_len, state);
-               } else {
+               if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
                        /* We expect errors as the qp is drained during shutdown */
-                       if (rds_conn_up(conn) || rds_conn_connecting(conn))
-                               rds_ib_conn_error(conn, "recv completion on %pI4 had "
-                                                 "status %u (%s), disconnecting and "
-                                                 "reconnecting\n", &conn->c_faddr,
-                                                 wc.status,
-                                                 rds_ib_wc_status_str(wc.status));
+                       if (wc.status == IB_WC_SUCCESS) {
+                               rds_ib_process_recv(conn, recv, wc.byte_len, &state);
+                       } else {
+                               rds_ib_conn_error(conn, "recv completion on "
+                                      "%pI4 had status %u, disconnecting and "
+                                      "reconnecting\n", &conn->c_faddr,
+                                      wc.status);
+                       }
                }
 
-               /*
-                * It's very important that we only free this ring entry if we've truly
-                * freed the resources allocated to the entry.  The refilling path can
-                * leak if we don't.
-                */
                rds_ib_ring_free(&ic->i_recv_ring, 1);
        }
-}
-
-void rds_ib_recv_tasklet_fn(unsigned long data)
-{
-       struct rds_ib_connection *ic = (struct rds_ib_connection *) data;
-       struct rds_connection *conn = ic->conn;
-       struct rds_ib_ack_state state = { 0, };
-
-       rds_poll_cq(ic, &state);
-       ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
-       rds_poll_cq(ic, &state);
 
        if (state.ack_next_valid)
                rds_ib_set_ack(ic, state.ack_next, state.ack_required);
@@ -1015,8 +838,11 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
        if (rds_ib_ring_empty(&ic->i_recv_ring))
                rds_ib_stats_inc(s_ib_rx_ring_empty);
 
+       /*
+        * If the ring is running low, then schedule the thread to refill.
+        */
        if (rds_ib_ring_low(&ic->i_recv_ring))
-               rds_ib_recv_refill(conn, 0);
+               queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
 }
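
The deleted tasklet body makes the classic drain/arm/drain shape explicit: poll the CQ dry, arm the next completion notification, then poll again, because a completion that lands between the last empty poll and the rearm would otherwise sit in the queue with no interrupt promised for it. A toy sketch of that shape; poll_one() and arm_notify() are stand-ins for ib_poll_cq()/ib_req_notify_cq(), not real IB verbs signatures:

    #include <stdbool.h>
    #include <stdio.h>

    static int pending = 3;     /* simulated completions in the queue */

    static bool poll_one(void)          /* ~ ib_poll_cq(cq, 1, &wc) > 0 */
    {
            if (pending == 0)
                    return false;
            pending--;
            return true;
    }

    static void arm_notify(void)        /* ~ ib_req_notify_cq(cq, ...) */
    {
            printf("notification armed\n");
    }

    /* Drain, arm, drain again: the second drain catches a completion
     * that arrived after the first drain saw the queue empty but
     * before arm_notify() took effect. */
    static void completion_handler(void)
    {
            while (poll_one())
                    printf("reaped completion\n");
            arm_notify();
            while (poll_one())
                    printf("reaped late completion\n");
    }

    int main(void)
    {
            completion_handler();
            return 0;
    }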
 
 int rds_ib_recv(struct rds_connection *conn)
@@ -1025,13 +851,25 @@ int rds_ib_recv(struct rds_connection *conn)
        int ret = 0;
 
        rdsdebug("conn %p\n", conn);
+
+       /*
+        * If we get a temporary posting failure in this context then
+        * we're really low and we want the caller to back off for a bit.
+        */
+       mutex_lock(&ic->i_recv_mutex);
+       if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
+               ret = -ENOMEM;
+       else
+               rds_ib_stats_inc(s_ib_rx_refill_from_thread);
+       mutex_unlock(&ic->i_recv_mutex);
+
        if (rds_conn_up(conn))
                rds_ib_attempt_ack(ic);
 
        return ret;
 }
 
-int rds_ib_recv_init(void)
+int __init rds_ib_recv_init(void)
 {
        struct sysinfo si;
        int ret = -ENOMEM;
@@ -1042,14 +880,14 @@ int rds_ib_recv_init(void)
 
        rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
                                        sizeof(struct rds_ib_incoming),
-                                       0, SLAB_HWCACHE_ALIGN, NULL);
-       if (!rds_ib_incoming_slab)
+                                       0, 0, NULL);
+       if (rds_ib_incoming_slab == NULL)
                goto out;
 
        rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
                                        sizeof(struct rds_page_frag),
-                                       0, SLAB_HWCACHE_ALIGN, NULL);
-       if (!rds_ib_frag_slab)
+                                       0, 0, NULL);
+       if (rds_ib_frag_slab == NULL)
                kmem_cache_destroy(rds_ib_incoming_slab);
        else
                ret = 0;
index ff97e8eda858bbb2621c1108ab660ecbc0a6fddc..99a6ccae964cbd0e4fd5eccbaa9c38665be26f19 100644 (file)
@@ -137,7 +137,7 @@ int rds_ib_ring_empty(struct rds_ib_work_ring *ring)
 
 int rds_ib_ring_low(struct rds_ib_work_ring *ring)
 {
-       return __rds_ib_ring_used(ring) <= (ring->w_nr >> 1);
+       return __rds_ib_ring_used(ring) <= (ring->w_nr >> 2);
 }
 
 /*
index bd3825d38abc923bd905b6af266b4fffe706f427..cb6c52cb1c4c1aa47a085e2d71bdf1b61cf2dfa7 100644 (file)
 #include <linux/in.h>
 #include <linux/device.h>
 #include <linux/dmapool.h>
-#include <linux/ratelimit.h>
 
 #include "rds.h"
+#include "rdma.h"
 #include "ib.h"
 
-static char *rds_ib_wc_status_strings[] = {
-#define RDS_IB_WC_STATUS_STR(foo) \
-               [IB_WC_##foo] = __stringify(IB_WC_##foo)
-       RDS_IB_WC_STATUS_STR(SUCCESS),
-       RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
-       RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
-       RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
-       RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
-       RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
-       RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
-       RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
-       RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
-       RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
-       RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
-       RDS_IB_WC_STATUS_STR(REM_OP_ERR),
-       RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
-       RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
-       RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
-       RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
-       RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
-       RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
-       RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
-       RDS_IB_WC_STATUS_STR(FATAL_ERR),
-       RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
-       RDS_IB_WC_STATUS_STR(GENERAL_ERR),
-#undef RDS_IB_WC_STATUS_STR
-};
-
-char *rds_ib_wc_status_str(enum ib_wc_status status)
-{
-       return rds_str_array(rds_ib_wc_status_strings,
-                            ARRAY_SIZE(rds_ib_wc_status_strings), status);
-}
-
-/*
- * Convert IB-specific error message to RDS error message and call core
- * completion handler.
- */
-static void rds_ib_send_complete(struct rds_message *rm,
-                                int wc_status,
-                                void (*complete)(struct rds_message *rm, int status))
+static void rds_ib_send_rdma_complete(struct rds_message *rm,
+                                     int wc_status)
 {
        int notify_status;
 
@@ -99,124 +60,69 @@ static void rds_ib_send_complete(struct rds_message *rm,
                notify_status = RDS_RDMA_OTHER_ERROR;
                break;
        }
-       complete(rm, notify_status);
-}
-
-static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
-                                  struct rm_data_op *op,
-                                  int wc_status)
-{
-       if (op->op_nents)
-               ib_dma_unmap_sg(ic->i_cm_id->device,
-                               op->op_sg, op->op_nents,
-                               DMA_TO_DEVICE);
+       rds_rdma_send_complete(rm, notify_status);
 }
 
 static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
-                                  struct rm_rdma_op *op,
-                                  int wc_status)
+                                  struct rds_rdma_op *op)
 {
-       if (op->op_mapped) {
+       if (op->r_mapped) {
                ib_dma_unmap_sg(ic->i_cm_id->device,
-                               op->op_sg, op->op_nents,
-                               op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-               op->op_mapped = 0;
+                       op->r_sg, op->r_nents,
+                       op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+               op->r_mapped = 0;
        }
-
-       /* If the user asked for a completion notification on this
-        * message, we can implement three different semantics:
-        *  1.  Notify when we received the ACK on the RDS message
-        *      that was queued with the RDMA. This provides reliable
-        *      notification of RDMA status at the expense of a one-way
-        *      packet delay.
-        *  2.  Notify when the IB stack gives us the completion event for
-        *      the RDMA operation.
-        *  3.  Notify when the IB stack gives us the completion event for
-        *      the accompanying RDS messages.
-        * Here, we implement approach #3. To implement approach #2,
-        * we would need to take an event for the rdma WR. To implement #1,
-        * don't call rds_rdma_send_complete at all, and fall back to the notify
-        * handling in the ACK processing code.
-        *
-        * Note: There's no need to explicitly sync any RDMA buffers using
-        * ib_dma_sync_sg_for_cpu - the completion for the RDMA
-        * operation itself unmapped the RDMA buffers, which takes care
-        * of synching.
-        */
-       rds_ib_send_complete(container_of(op, struct rds_message, rdma),
-                            wc_status, rds_rdma_send_complete);
-
-       if (op->op_write)
-               rds_stats_add(s_send_rdma_bytes, op->op_bytes);
-       else
-               rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
 }
 
-static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
-                                    struct rm_atomic_op *op,
-                                    int wc_status)
+static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
+                         struct rds_ib_send_work *send,
+                         int wc_status)
 {
-       /* unmap atomic recvbuf */
-       if (op->op_mapped) {
-               ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
-                               DMA_FROM_DEVICE);
-               op->op_mapped = 0;
-       }
-
-       rds_ib_send_complete(container_of(op, struct rds_message, atomic),
-                            wc_status, rds_atomic_send_complete);
-
-       if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
-               rds_ib_stats_inc(s_ib_atomic_cswp);
-       else
-               rds_ib_stats_inc(s_ib_atomic_fadd);
-}
+       struct rds_message *rm = send->s_rm;
+
+       rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
+
+       ib_dma_unmap_sg(ic->i_cm_id->device,
+                    rm->m_sg, rm->m_nents,
+                    DMA_TO_DEVICE);
+
+       if (rm->m_rdma_op != NULL) {
+               rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
+
+               /* If the user asked for a completion notification on this
+                * message, we can implement three different semantics:
+                *  1.  Notify when we received the ACK on the RDS message
+                *      that was queued with the RDMA. This provides reliable
+                *      notification of RDMA status at the expense of a one-way
+                *      packet delay.
+                *  2.  Notify when the IB stack gives us the completion event for
+                *      the RDMA operation.
+                *  3.  Notify when the IB stack gives us the completion event for
+                *      the accompanying RDS messages.
+                * Here, we implement approach #3. To implement approach #2,
+                * call rds_rdma_send_complete from the cq_handler. To implement #1,
+                * don't call rds_rdma_send_complete at all, and fall back to the notify
+                * handling in the ACK processing code.
+                *
+                * Note: There's no need to explicitly sync any RDMA buffers using
+                * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+                * operation itself unmapped the RDMA buffers, which takes care
+                * of synching.
+                */
+               rds_ib_send_rdma_complete(rm, wc_status);
 
-/*
- * Unmap the resources associated with a struct send_work.
- *
- * Returns the rm for no good reason other than it is unobtainable
- * other than by switching on wr.opcode, currently, and the caller,
- * the event handler, needs it.
- */
-static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
-                                               struct rds_ib_send_work *send,
-                                               int wc_status)
-{
-       struct rds_message *rm = NULL;
-
-       /* In the error case, wc.opcode sometimes contains garbage */
-       switch (send->s_wr.opcode) {
-       case IB_WR_SEND:
-               if (send->s_op) {
-                       rm = container_of(send->s_op, struct rds_message, data);
-                       rds_ib_send_unmap_data(ic, send->s_op, wc_status);
-               }
-               break;
-       case IB_WR_RDMA_WRITE:
-       case IB_WR_RDMA_READ:
-               if (send->s_op) {
-                       rm = container_of(send->s_op, struct rds_message, rdma);
-                       rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
-               }
-               break;
-       case IB_WR_ATOMIC_FETCH_AND_ADD:
-       case IB_WR_ATOMIC_CMP_AND_SWP:
-               if (send->s_op) {
-                       rm = container_of(send->s_op, struct rds_message, atomic);
-                       rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
-               }
-               break;
-       default:
-               printk_ratelimited(KERN_NOTICE
-                              "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
-                              __func__, send->s_wr.opcode);
-               break;
+               if (rm->m_rdma_op->r_write)
+                       rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
+               else
+                       rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
        }
 
-       send->s_wr.opcode = 0xdead;
+       /* If anyone waited for this message to get flushed out, wake
+        * them up now */
+       rds_message_unmapped(rm);
 
-       return rm;
+       rds_message_put(rm);
+       send->s_rm = NULL;
 }
 
 void rds_ib_send_init_ring(struct rds_ib_connection *ic)
@@ -227,18 +133,23 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
        for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
                struct ib_sge *sge;
 
+               send->s_rm = NULL;
                send->s_op = NULL;
 
                send->s_wr.wr_id = i;
                send->s_wr.sg_list = send->s_sge;
+               send->s_wr.num_sge = 1;
+               send->s_wr.opcode = IB_WR_SEND;
+               send->s_wr.send_flags = 0;
                send->s_wr.ex.imm_data = 0;
 
-               sge = &send->s_sge[0];
+               sge = rds_ib_data_sge(ic, send->s_sge);
+               sge->lkey = ic->i_mr->lkey;
+
+               sge = rds_ib_header_sge(ic, send->s_sge);
                sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
                sge->length = sizeof(struct rds_header);
                sge->lkey = ic->i_mr->lkey;
-
-               send->s_sge[1].lkey = ic->i_mr->lkey;
        }
 }
 
@@ -248,23 +159,15 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
        u32 i;
 
        for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
-               if (send->s_op && send->s_wr.opcode != 0xdead)
-                       rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
+               if (send->s_wr.opcode == 0xdead)
+                       continue;
+               if (send->s_rm)
+                       rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
+               if (send->s_op)
+                       rds_ib_send_unmap_rdma(ic, send->s_op);
        }
 }
 
-/*
- * The only fast path caller always has a non-zero nr, so we don't
- * bother testing nr before performing the atomic sub.
- */
-static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
-{
-       if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
-           waitqueue_active(&rds_ib_ring_empty_wait))
-               wake_up(&rds_ib_ring_empty_wait);
-       BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
-}
-
 /*
  * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
  * operations performed in the send path.  As the sender allocs and potentially
@@ -275,14 +178,12 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 {
        struct rds_connection *conn = context;
        struct rds_ib_connection *ic = conn->c_transport_data;
-       struct rds_message *rm = NULL;
        struct ib_wc wc;
        struct rds_ib_send_work *send;
        u32 completed;
        u32 oldest;
        u32 i = 0;
        int ret;
-       int nr_sig = 0;
 
        rdsdebug("cq %p conn %p\n", cq, conn);
        rds_ib_stats_inc(s_ib_tx_cq_call);
@@ -291,14 +192,13 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
                rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
 
        while (ib_poll_cq(cq, 1, &wc) > 0) {
-               rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
-                        (unsigned long long)wc.wr_id, wc.status,
-                        rds_ib_wc_status_str(wc.status), wc.byte_len,
+               rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+                        (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
                         be32_to_cpu(wc.ex.imm_data));
                rds_ib_stats_inc(s_ib_tx_cq_event);
 
                if (wc.wr_id == RDS_IB_ACK_WR_ID) {
-                       if (time_after(jiffies, ic->i_ack_queued + HZ/2))
+                       if (ic->i_ack_queued + HZ/2 < jiffies)
                                rds_ib_stats_inc(s_ib_tx_stalled);
                        rds_ib_ack_send_complete(ic);
                        continue;
@@ -310,41 +210,58 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 
                for (i = 0; i < completed; i++) {
                        send = &ic->i_sends[oldest];
-                       if (send->s_wr.send_flags & IB_SEND_SIGNALED)
-                               nr_sig++;
 
-                       rm = rds_ib_send_unmap_op(ic, send, wc.status);
+                       /* In the error case, wc.opcode sometimes contains garbage */
+                       switch (send->s_wr.opcode) {
+                       case IB_WR_SEND:
+                               if (send->s_rm)
+                                       rds_ib_send_unmap_rm(ic, send, wc.status);
+                               break;
+                       case IB_WR_RDMA_WRITE:
+                       case IB_WR_RDMA_READ:
+                               /* Nothing to be done - the SG list will be unmapped
+                                * when the SEND completes. */
+                               break;
+                       default:
+                               if (printk_ratelimit())
+                                       printk(KERN_NOTICE
+                                               "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
+                                               __func__, send->s_wr.opcode);
+                               break;
+                       }
 
-                       if (time_after(jiffies, send->s_queued + HZ/2))
+                       send->s_wr.opcode = 0xdead;
+                       send->s_wr.num_sge = 1;
+                       if (send->s_queued + HZ/2 < jiffies)
                                rds_ib_stats_inc(s_ib_tx_stalled);
 
-                       if (send->s_op) {
-                               if (send->s_op == rm->m_final_op) {
-                                       /* If anyone waited for this message to get flushed out, wake
-                                        * them up now */
-                                       rds_message_unmapped(rm);
-                               }
-                               rds_message_put(rm);
-                               send->s_op = NULL;
+                       /* If a RDMA operation produced an error, signal this right
+                        * away. If we don't, the subsequent SEND that goes with this
+                        * RDMA will be canceled with ERR_WFLUSH, and the application
+                        * never learns that the RDMA failed. */
+                       if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
+                               struct rds_message *rm;
+
+                               rm = rds_send_get_message(conn, send->s_op);
+                               if (rm)
+                                       rds_ib_send_rdma_complete(rm, wc.status);
                        }
 
                        oldest = (oldest + 1) % ic->i_send_ring.w_nr;
                }
 
                rds_ib_ring_free(&ic->i_send_ring, completed);
-               rds_ib_sub_signaled(ic, nr_sig);
-               nr_sig = 0;
 
-               if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
-                   test_bit(0, &conn->c_map_queued))
+               if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
+                || test_bit(0, &conn->c_map_queued))
                        queue_delayed_work(rds_wq, &conn->c_send_w, 0);
 
                /* We expect errors as the qp is drained during shutdown */
                if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
-                       rds_ib_conn_error(conn, "send completion on %pI4 had status "
-                                         "%u (%s), disconnecting and reconnecting\n",
-                                         &conn->c_faddr, wc.status,
-                                         rds_ib_wc_status_str(wc.status));
+                       rds_ib_conn_error(conn,
+                               "send completion on %pI4 "
+                               "had status %u, disconnecting and reconnecting\n",
+                               &conn->c_faddr, wc.status);
                }
        }
 }
@@ -355,7 +272,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
  *
  * Conceptually, we have two counters:
  *  -  send credits: this tells us how many WRs we're allowed
- *     to submit without overruning the receiver's queue. For
+ *     to submit without overrunning the receiver's queue. For
  *     each SEND WR we post, we decrement this by one.
  *
  *  -  posted credits: this tells us how many WRs we recently
@@ -373,7 +290,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
  * credits (see rds_ib_send_add_credits below).
  *
  * The RDS send code is essentially single-threaded; rds_send_xmit
- * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
+ * grabs c_send_lock to ensure exclusive access to the send ring.
  * However, the ACK sending code is independent and can race with
  * message SENDs.
  *
@@ -394,7 +311,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
  * and using atomic_cmpxchg when updating the two counters.
  */
 int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
-                            u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
+                            u32 wanted, u32 *adv_credits, int need_posted)
 {
        unsigned int avail, posted, got = 0, advertise;
        long oldval, newval;
@@ -409,7 +326,7 @@ try_again:
        posted = IB_GET_POST_CREDITS(oldval);
        avail = IB_GET_SEND_CREDITS(oldval);
 
-       rdsdebug("wanted=%u credits=%u posted=%u\n",
+       rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n",
                        wanted, avail, posted);
 
        /* The last credit must be used to send a credit update. */
@@ -434,7 +351,7 @@ try_again:
         * available.
         */
        if (posted && (got || need_posted)) {
-               advertise = min_t(unsigned int, posted, max_posted);
+               advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
                newval -= IB_SET_POST_CREDITS(advertise);
        }
 
@@ -453,7 +370,7 @@ void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
        if (credits == 0)
                return;
 
-       rdsdebug("credits=%u current=%u%s\n",
+       rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n",
                        credits,
                        IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
                        test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
@@ -492,21 +409,40 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
                set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
 }
 
-static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
-                                            struct rds_ib_send_work *send,
-                                            bool notify)
+static inline void
+rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
+               struct rds_ib_send_work *send, unsigned int pos,
+               unsigned long buffer, unsigned int length,
+               int send_flags)
 {
-       /*
-        * We want to delay signaling completions just enough to get
-        * the batching benefits but not so much that we create dead time
-        * on the wire.
-        */
-       if (ic->i_unsignaled_wrs-- == 0 || notify) {
-               ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
-               send->s_wr.send_flags |= IB_SEND_SIGNALED;
-               return 1;
+       struct ib_sge *sge;
+
+       WARN_ON(pos != send - ic->i_sends);
+
+       send->s_wr.send_flags = send_flags;
+       send->s_wr.opcode = IB_WR_SEND;
+       send->s_wr.num_sge = 2;
+       send->s_wr.next = NULL;
+       send->s_queued = jiffies;
+       send->s_op = NULL;
+
+       if (length != 0) {
+               sge = rds_ib_data_sge(ic, send->s_sge);
+               sge->addr = buffer;
+               sge->length = length;
+               sge->lkey = ic->i_mr->lkey;
+
+               sge = rds_ib_header_sge(ic, send->s_sge);
+       } else {
+               /* We're sending a packet with no payload. There is only
+                * one SGE */
+               send->s_wr.num_sge = 1;
+               sge = &send->s_sge[0];
        }
-       return 0;
+
+       sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
+       sge->length = sizeof(struct rds_header);
+       sge->lkey = ic->i_mr->lkey;
 }
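
Both the helper removed above and the inline code restored further down implement the same throttle: leave most SENDs unsignaled and request a completion only every rds_ib_sysctl_max_unsig_wrs-th work request (the restored code also signals once a byte budget is spent), batching completion interrupts without letting the send ring go un-reaped for long. A minimal sketch of the countdown, with made-up limits:

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_UNSIG_WRS   16      /* illustrative sysctl values */
    #define MAX_UNSIG_BYTES 65536

    static int  unsig_wrs   = MAX_UNSIG_WRS;
    static long unsig_bytes = MAX_UNSIG_BYTES;

    /* Decide whether this work request should ask for a completion. */
    static bool should_signal(unsigned int len)
    {
            bool signal = false;

            if (unsig_wrs-- == 0) {             /* every Nth WR */
                    unsig_wrs = MAX_UNSIG_WRS;
                    signal = true;
            }
            unsig_bytes -= len;
            if (unsig_bytes <= 0) {             /* or every N bytes */
                    unsig_bytes = MAX_UNSIG_BYTES;
                    signal = true;
            }
            return signal;
    }

    int main(void)
    {
            for (int i = 0; i < 40; i++)
                    if (should_signal(4096))
                            printf("WR %d: signaled\n", i);
            return 0;
    }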
 
 /*
@@ -535,27 +471,17 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
        u32 pos;
        u32 i;
        u32 work_alloc;
-       u32 credit_alloc = 0;
+       u32 credit_alloc;
        u32 posted;
        u32 adv_credits = 0;
        int send_flags = 0;
-       int bytes_sent = 0;
+       int sent;
        int ret;
        int flow_controlled = 0;
-       int nr_sig = 0;
 
        BUG_ON(off % RDS_FRAG_SIZE);
        BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
 
-       /* Do not send cong updates to IB loopback */
-       if (conn->c_loopback
-           && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
-               rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
-               scat = &rm->data.op_sg[sg];
-               ret = max_t(int, RDS_CONG_MAP_BYTES, scat->length);
-               return sizeof(struct rds_header) + ret;
-       }
-
        /* FIXME we may overallocate here */
        if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
                i = 1;
@@ -570,16 +496,17 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
                goto out;
        }
 
+       credit_alloc = work_alloc;
        if (ic->i_flowctl) {
-               credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
+               credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0);
                adv_credits += posted;
                if (credit_alloc < work_alloc) {
                        rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
                        work_alloc = credit_alloc;
-                       flow_controlled = 1;
+                       flow_controlled++;
                }
                if (work_alloc == 0) {
-                       set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+                       rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
                        rds_ib_stats_inc(s_ib_tx_throttle);
                        ret = -ENOMEM;
                        goto out;
@@ -587,25 +514,31 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
        }
 
        /* map the message the first time we see it */
-       if (!ic->i_data_op) {
-               if (rm->data.op_nents) {
-                       rm->data.op_count = ib_dma_map_sg(dev,
-                                                         rm->data.op_sg,
-                                                         rm->data.op_nents,
-                                                         DMA_TO_DEVICE);
-                       rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
-                       if (rm->data.op_count == 0) {
+       if (ic->i_rm == NULL) {
+               /*
+               printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
+                               be16_to_cpu(rm->m_inc.i_hdr.h_dport),
+                               rm->m_inc.i_hdr.h_flags,
+                               be32_to_cpu(rm->m_inc.i_hdr.h_len));
+                  */
+               if (rm->m_nents) {
+                       rm->m_count = ib_dma_map_sg(dev,
+                                        rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
+                       rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
+                       if (rm->m_count == 0) {
                                rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
                                rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
                                ret = -ENOMEM; /* XXX ? */
                                goto out;
                        }
                } else {
-                       rm->data.op_count = 0;
+                       rm->m_count = 0;
                }
 
+               ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+               ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
                rds_message_addref(rm);
-               ic->i_data_op = &rm->data;
+               ic->i_rm = rm;
 
                /* Finalize the header */
                if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
@@ -615,10 +548,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 
                /* If it has a RDMA op, tell the peer we did it. This is
                 * used by the peer to release use-once RDMA MRs. */
-               if (rm->rdma.op_active) {
+               if (rm->m_rdma_op) {
                        struct rds_ext_header_rdma ext_hdr;
 
-                       ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
+                       ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
                        rds_message_add_extension(&rm->m_inc.i_hdr,
                                        RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
                }
@@ -638,12 +571,18 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
                /*
                 * Update adv_credits since we reset the ACK_REQUIRED bit.
                 */
-               if (ic->i_flowctl) {
-                       rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
-                       adv_credits += posted;
-                       BUG_ON(adv_credits > 255);
-               }
-       }
+               rds_ib_send_grab_credits(ic, 0, &posted, 1);
+               adv_credits += posted;
+               BUG_ON(adv_credits > 255);
+       } else if (ic->i_rm != rm)
+               BUG();
+
+       send = &ic->i_sends[pos];
+       first = send;
+       prev = NULL;
+       scat = &rm->m_sg[sg];
+       sent = 0;
+       i = 0;
 
        /* Sometimes you want to put a fence between an RDMA
         * READ and the following SEND.
@@ -651,64 +590,81 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
         * or when requested by the user. Right now, we let
         * the application choose.
         */
-       if (rm->rdma.op_active && rm->rdma.op_fence)
+       if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
                send_flags = IB_SEND_FENCE;
 
-       /* Each frag gets a header. Msgs may be 0 bytes */
-       send = &ic->i_sends[pos];
-       first = send;
-       prev = NULL;
-       scat = &ic->i_data_op->op_sg[sg];
-       i = 0;
-       do {
-               unsigned int len = 0;
-
-               /* Set up the header */
-               send->s_wr.send_flags = send_flags;
-               send->s_wr.opcode = IB_WR_SEND;
-               send->s_wr.num_sge = 1;
-               send->s_wr.next = NULL;
-               send->s_queued = jiffies;
-               send->s_op = NULL;
+       /*
+        * We could be copying the header into the unused tail of the page.
+        * That would need to be changed in the future when those pages might
+        * be mapped userspace pages or page cache pages.  So instead we always
+        * use a second sge and our long-lived ring of mapped headers.  We send
+        * the header after the data so that the data payload can be aligned on
+        * the receiver.
+        */
 
-               send->s_sge[0].addr = ic->i_send_hdrs_dma
-                       + (pos * sizeof(struct rds_header));
-               send->s_sge[0].length = sizeof(struct rds_header);
+       /* handle a 0-len message */
+       if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
+               rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
+               goto add_header;
+       }
 
-               memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+       /* if there's data reference it with a chain of work reqs */
+       for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
+               unsigned int len;
 
-               /* Set up the data, if present */
-               if (i < work_alloc
-                   && scat != &rm->data.op_sg[rm->data.op_count]) {
-                       len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
-                       send->s_wr.num_sge = 2;
+               send = &ic->i_sends[pos];
 
-                       send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
-                       send->s_sge[1].length = len;
+               len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+               rds_ib_xmit_populate_wr(ic, send, pos,
+                               ib_sg_dma_address(dev, scat) + off, len,
+                               send_flags);
 
-                       bytes_sent += len;
-                       off += len;
-                       if (off == ib_sg_dma_len(dev, scat)) {
-                               scat++;
-                               off = 0;
-                       }
+               /*
+                * We want to delay signaling completions just enough to get
+                * the batching benefits but not so much that we create dead time
+                * on the wire.
+                */
+               if (ic->i_unsignaled_wrs-- == 0) {
+                       ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+                       send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
                }
 
-               rds_ib_set_wr_signal_state(ic, send, 0);
+               ic->i_unsignaled_bytes -= len;
+               if (ic->i_unsignaled_bytes <= 0) {
+                       ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
+                       send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+               }
 
                /*
                 * Always signal the last one if we're stopping due to flow control.
                 */
-               if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
+               if (flow_controlled && i == (work_alloc-1))
                        send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
 
-               if (send->s_wr.send_flags & IB_SEND_SIGNALED)
-                       nr_sig++;
-
                rdsdebug("send %p wr %p num_sge %u next %p\n", send,
                         &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
 
-               if (ic->i_flowctl && adv_credits) {
+               sent += len;
+               off += len;
+               if (off == ib_sg_dma_len(dev, scat)) {
+                       scat++;
+                       off = 0;
+               }
+
+add_header:
+               /* Tack on the header after the data. The header SGE should already
+                * have been set up to point to the right header buffer. */
+               memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+
+               if (0) {
+                       struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+                       printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
+                               be16_to_cpu(hdr->h_dport),
+                               hdr->h_flags,
+                               be32_to_cpu(hdr->h_len));
+               }
+               if (adv_credits) {
                        struct rds_header *hdr = &ic->i_send_hdrs[pos];
 
                        /* add credit and redo the header checksum */
@@ -723,25 +679,20 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
                prev = send;
 
                pos = (pos + 1) % ic->i_send_ring.w_nr;
-               send = &ic->i_sends[pos];
-               i++;
-
-       } while (i < work_alloc
-                && scat != &rm->data.op_sg[rm->data.op_count]);
+       }
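
As the restored comment explains, each fragment's work request gathers from two SGEs, the payload first and then a slot in the long-lived ring of pre-mapped headers, so the data lands aligned at the receiver and user pages never have header bytes copied into them. A sketch of how one WR's gather list comes together under that scheme (structs trimmed to the relevant fields; the 48-byte header size is an assumption for illustration, not taken from the kernel):

    #include <stdint.h>

    struct sge { uint64_t addr; uint32_t length; uint32_t lkey; };

    #define HDR_SIZE 48     /* assumed sizeof(struct rds_header) */

    /* Fill a two-entry gather list: payload, then header. Zero-length
     * messages carry only the header SGE. Returns num_sge. */
    static int build_send_sges(struct sge sg[2],
                               uint64_t data_dma, uint32_t data_len,
                               uint64_t hdr_ring_dma, unsigned int pos,
                               uint32_t lkey)
    {
            int n = 0;

            if (data_len) {
                    sg[n].addr   = data_dma;
                    sg[n].length = data_len;
                    sg[n].lkey   = lkey;
                    n++;
            }
            sg[n].addr   = hdr_ring_dma + (uint64_t)pos * HDR_SIZE;
            sg[n].length = HDR_SIZE;
            sg[n].lkey   = lkey;
            return n + 1;
    }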
 
        /* Account the RDS header in the number of bytes we sent, but just once.
         * The caller has no concept of fragmentation. */
        if (hdr_off == 0)
-               bytes_sent += sizeof(struct rds_header);
+               sent += sizeof(struct rds_header);
 
        /* if we finished the message then send completion owns it */
-       if (scat == &rm->data.op_sg[rm->data.op_count]) {
-               prev->s_op = ic->i_data_op;
-               prev->s_wr.send_flags |= IB_SEND_SOLICITED;
-               ic->i_data_op = NULL;
+       if (scat == &rm->m_sg[rm->m_count]) {
+               prev->s_rm = ic->i_rm;
+               prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+               ic->i_rm = NULL;
        }
 
-       /* Put back wrs & credits we didn't use */
        if (i < work_alloc) {
                rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
                work_alloc = i;
@@ -749,9 +700,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
        if (ic->i_flowctl && i < credit_alloc)
                rds_ib_send_add_credits(conn, credit_alloc - i);
 
-       if (nr_sig)
-               atomic_add(nr_sig, &ic->i_signaled_sends);
-
        /* XXX need to worry about failed_wr and partial sends. */
        failed_wr = &first->s_wr;
        ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
@@ -762,127 +710,32 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
                printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
                       "returned %d\n", &conn->c_faddr, ret);
                rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
-               rds_ib_sub_signaled(ic, nr_sig);
-               if (prev->s_op) {
-                       ic->i_data_op = prev->s_op;
-                       prev->s_op = NULL;
+               if (prev->s_rm) {
+                       ic->i_rm = prev->s_rm;
+                       prev->s_rm = NULL;
                }
-
-               rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
+               /* Finesse this later */
+               BUG();
                goto out;
        }
 
-       ret = bytes_sent;
+       ret = sent;
 out:
        BUG_ON(adv_credits);
        return ret;
 }
 
-/*
- * Issue atomic operation.
- * A simplified version of the rdma case, we always map 1 SG, and
- * only 8 bytes, for the return value from the atomic operation.
- */
-int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
-{
-       struct rds_ib_connection *ic = conn->c_transport_data;
-       struct rds_ib_send_work *send = NULL;
-       struct ib_send_wr *failed_wr;
-       struct rds_ib_device *rds_ibdev;
-       u32 pos;
-       u32 work_alloc;
-       int ret;
-       int nr_sig = 0;
-
-       rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-
-       work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
-       if (work_alloc != 1) {
-               rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
-               rds_ib_stats_inc(s_ib_tx_ring_full);
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       /* address of send request in ring */
-       send = &ic->i_sends[pos];
-       send->s_queued = jiffies;
-
-       if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
-               send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
-               send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
-               send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
-               send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
-               send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
-       } else { /* FADD */
-               send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
-               send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
-               send->s_wr.wr.atomic.swap = 0;
-               send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
-               send->s_wr.wr.atomic.swap_mask = 0;
-       }
-       nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
-       send->s_wr.num_sge = 1;
-       send->s_wr.next = NULL;
-       send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
-       send->s_wr.wr.atomic.rkey = op->op_rkey;
-       send->s_op = op;
-       rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
-
-       /* map 8 byte retval buffer to the device */
-       ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
-       rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
-       if (ret != 1) {
-               rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
-               rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
-               ret = -ENOMEM; /* XXX ? */
-               goto out;
-       }
-
-       /* Convert our struct scatterlist to struct ib_sge */
-       send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
-       send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
-       send->s_sge[0].lkey = ic->i_mr->lkey;
-
-       rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
-                send->s_sge[0].addr, send->s_sge[0].length);
-
-       if (nr_sig)
-               atomic_add(nr_sig, &ic->i_signaled_sends);
-
-       failed_wr = &send->s_wr;
-       ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
-       rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
-                send, &send->s_wr, ret, failed_wr);
-       BUG_ON(failed_wr != &send->s_wr);
-       if (ret) {
-               printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
-                      "returned %d\n", &conn->c_faddr, ret);
-               rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
-               rds_ib_sub_signaled(ic, nr_sig);
-               goto out;
-       }
-
-       if (unlikely(failed_wr != &send->s_wr)) {
-               printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
-               BUG_ON(failed_wr != &send->s_wr);
-       }
-
-out:
-       return ret;
-}
-
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 {
        struct rds_ib_connection *ic = conn->c_transport_data;
        struct rds_ib_send_work *send = NULL;
        struct rds_ib_send_work *first;
        struct rds_ib_send_work *prev;
        struct ib_send_wr *failed_wr;
+       struct rds_ib_device *rds_ibdev;
        struct scatterlist *scat;
        unsigned long len;
-       u64 remote_addr = op->op_remote_addr;
-       u32 max_sge = ic->rds_ibdev->max_sge;
+       u64 remote_addr = op->r_remote_addr;
        u32 pos;
        u32 work_alloc;
        u32 i;
@@ -890,28 +743,29 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
        int sent;
        int ret;
        int num_sge;
-       int nr_sig = 0;
-
-       /* map the op the first time we see it */
-       if (!op->op_mapped) {
-               op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
-                                            op->op_sg, op->op_nents, (op->op_write) ?
-                                            DMA_TO_DEVICE : DMA_FROM_DEVICE);
-               rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
-               if (op->op_count == 0) {
+
+       rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
+       /* map the message the first time we see it */
+       if (!op->r_mapped) {
+               op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
+                                       op->r_sg, op->r_nents, (op->r_write) ?
+                                       DMA_TO_DEVICE : DMA_FROM_DEVICE);
+               rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
+               if (op->r_count == 0) {
                        rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
                        ret = -ENOMEM; /* XXX ? */
                        goto out;
                }
 
-               op->op_mapped = 1;
+               op->r_mapped = 1;
        }
 
        /*
         * Instead of knowing how to return a partial rdma read/write, we insist that there
         * be enough work requests to send the entire message.
         */
-       i = ceil(op->op_count, max_sge);
+       i = ceil(op->r_count, rds_ibdev->max_sge);
 
        work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
        if (work_alloc != i) {
@@ -924,24 +778,30 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
        send = &ic->i_sends[pos];
        first = send;
        prev = NULL;
-       scat = &op->op_sg[0];
+       scat = &op->r_sg[0];
        sent = 0;
-       num_sge = op->op_count;
+       num_sge = op->r_count;
 
-       for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
+       for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
                send->s_wr.send_flags = 0;
                send->s_queued = jiffies;
-               send->s_op = NULL;
-
-               nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+               /*
+                * We want to delay signaling completions just enough to get
+                * the batching benefits but not so much that we create dead time on the wire.
+                */
+               if (ic->i_unsignaled_wrs-- == 0) {
+                       ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+                       send->s_wr.send_flags = IB_SEND_SIGNALED;
+               }
 
-               send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+               send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
                send->s_wr.wr.rdma.remote_addr = remote_addr;
-               send->s_wr.wr.rdma.rkey = op->op_rkey;
+               send->s_wr.wr.rdma.rkey = op->r_key;
+               send->s_op = op;
 
-               if (num_sge > max_sge) {
-                       send->s_wr.num_sge = max_sge;
-                       num_sge -= max_sge;
+               if (num_sge > rds_ibdev->max_sge) {
+                       send->s_wr.num_sge = rds_ibdev->max_sge;
+                       num_sge -= rds_ibdev->max_sge;
                } else {
                        send->s_wr.num_sge = num_sge;
                }
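The i_unsignaled_wrs counter restored above is hand-rolled completion batching: only roughly one in rds_ib_sysctl_max_unsig_wrs work requests (default 16, see the ib_sysctl.c hunks below) is posted with IB_SEND_SIGNALED, so the send CQ fires often enough for the ring to be reaped without paying for an event per WR. The pattern in isolation, names as in the hunk:

    /* post-decrement: the old value is tested, then the budget
     * shrinks; when it hits 0, signal this WR and reset the budget */
    if (ic->i_unsignaled_wrs-- == 0) {
            ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
            send->s_wr.send_flags = IB_SEND_SIGNALED;
    }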
@@ -951,7 +811,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
                if (prev)
                        prev->s_wr.next = &send->s_wr;
 
-               for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+               for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
                        len = ib_sg_dma_len(ic->i_cm_id->device, scat);
                        send->s_sge[j].addr =
                                 ib_sg_dma_address(ic->i_cm_id->device, scat);
@@ -973,20 +833,15 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
                        send = ic->i_sends;
        }
 
-       /* give a reference to the last op */
-       if (scat == &op->op_sg[op->op_count]) {
-               prev->s_op = op;
-               rds_message_addref(container_of(op, struct rds_message, rdma));
-       }
+       /* if we finished the message then send completion owns it */
+       if (scat == &op->r_sg[op->r_count])
+               prev->s_wr.send_flags = IB_SEND_SIGNALED;
 
        if (i < work_alloc) {
                rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
                work_alloc = i;
        }
 
-       if (nr_sig)
-               atomic_add(nr_sig, &ic->i_signaled_sends);
-
        failed_wr = &first->s_wr;
        ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
        rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
@@ -996,7 +851,6 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
                printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
                       "returned %d\n", &conn->c_faddr, ret);
                rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
-               rds_ib_sub_signaled(ic, nr_sig);
                goto out;
        }
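Two completion-ownership schemes meet in the hunks above. The reverted upstream code took a reference on the containing rds_message and pointed only the last WR's s_op at the op; the restored code instead sets send->s_op = op on every WR and forces the final WR signaled, so the send completion the HCA delivers after all preceding WRs is what hands the finished op back for unmapping. The handoff, as restored:

    /* if we finished posting the whole scatterlist, the message is
     * fully on the wire; the completion for this last, signaled WR
     * implies all earlier WRs completed too, so its handler may
     * unmap the op and run RDMA-completion notification. */
    if (scat == &op->r_sg[op->r_count])
            prev->s_wr.send_flags = IB_SEND_SIGNALED;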
 
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index 2d5965d6e97c039517d219bfdad3f28f7437b1d7..02e3e3d50d4a3c952bb0817e0e596a94ccba1c61 100644
@@ -37,9 +37,9 @@
 #include "rds.h"
 #include "ib.h"
 
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
+DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned;
 
-static const char *const rds_ib_stat_names[] = {
+static char *rds_ib_stat_names[] = {
        "ib_connect_raced",
        "ib_listen_closed_stale",
        "ib_tx_cq_call",
@@ -67,8 +67,6 @@ static const char *const rds_ib_stat_names[] = {
        "ib_rdma_mr_pool_flush",
        "ib_rdma_mr_pool_wait",
        "ib_rdma_mr_pool_depleted",
-       "ib_atomic_cswp",
-       "ib_atomic_fadd",
 };
 
 unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
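For context on the counters above: each string in rds_ib_stat_names names a matching u64 member of struct rds_ib_statistics, and rds_ib_stats_info_copy() sums the per-cpu copies when userspace asks for a snapshot. The hot-path increment needs no atomics because each CPU bumps only its own copy; a sketch of the macro chain, an assumption based on the rds_stats_inc_which() pattern in net/rds/rds.h:

    /* assumed shape of the helpers; not part of this diff */
    #define rds_ib_stats_inc(member) \
            rds_stats_inc_which(rds_ib_stats, member)
    #define rds_stats_inc_which(which, member) do {         \
            per_cpu(which, get_cpu()).member++;             \
            put_cpu();                                      \
    } while (0)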
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
index e4e41b3afce7134119dfebd1b22bf8464492a663..d87830db93a0a4d1ee0e3a4ea89199a2a15427f9 100644
@@ -49,73 +49,89 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
 static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
 static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
 
-/*
- * This sysctl does nothing.
- *
- * Backwards compatibility with RDS 3.0 wire protocol
- * disables initial FC credit exchange.
- * If it's ever possible to drop 3.0 support,
- * setting this to 1 and moving init/refill of send/recv
- * rings from ib_cm_connect_complete() back into ib_setup_qp()
- * will cause credits to be added before protocol negotiation.
- */
-unsigned int rds_ib_sysctl_flow_control = 0;
+unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
+static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
+static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
 
-static struct ctl_table rds_ib_sysctl_table[] = {
+unsigned int rds_ib_sysctl_flow_control = 1;
+
+ctl_table rds_ib_sysctl_table[] = {
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "max_send_wr",
                .data           = &rds_ib_sysctl_max_send_wr,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-               .proc_handler   = proc_doulongvec_minmax,
+               .proc_handler   = &proc_doulongvec_minmax,
                .extra1         = &rds_ib_sysctl_max_wr_min,
                .extra2         = &rds_ib_sysctl_max_wr_max,
        },
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "max_recv_wr",
                .data           = &rds_ib_sysctl_max_recv_wr,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-               .proc_handler   = proc_doulongvec_minmax,
+               .proc_handler   = &proc_doulongvec_minmax,
                .extra1         = &rds_ib_sysctl_max_wr_min,
                .extra2         = &rds_ib_sysctl_max_wr_max,
        },
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "max_unsignaled_wr",
                .data           = &rds_ib_sysctl_max_unsig_wrs,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-               .proc_handler   = proc_doulongvec_minmax,
+               .proc_handler   = &proc_doulongvec_minmax,
                .extra1         = &rds_ib_sysctl_max_unsig_wr_min,
                .extra2         = &rds_ib_sysctl_max_unsig_wr_max,
        },
        {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "max_unsignaled_bytes",
+               .data           = &rds_ib_sysctl_max_unsig_bytes,
+               .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+               .proc_handler   = &proc_doulongvec_minmax,
+               .extra1         = &rds_ib_sysctl_max_unsig_bytes_min,
+               .extra2         = &rds_ib_sysctl_max_unsig_bytes_max,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "max_recv_allocation",
                .data           = &rds_ib_sysctl_max_recv_allocation,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-               .proc_handler   = proc_doulongvec_minmax,
+               .proc_handler   = &proc_doulongvec_minmax,
        },
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "flow_control",
                .data           = &rds_ib_sysctl_flow_control,
                .maxlen         = sizeof(rds_ib_sysctl_flow_control),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = &proc_dointvec,
        },
+       { .ctl_name = 0}
+};
+
+static struct ctl_path rds_ib_sysctl_path[] = {
+       { .procname = "net", .ctl_name = CTL_NET, },
+       { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
+       { .procname = "ib", .ctl_name = CTL_UNNUMBERED, },
        { }
 };
 
 void rds_ib_sysctl_exit(void)
 {
        if (rds_ib_sysctl_hdr)
-               unregister_net_sysctl_table(rds_ib_sysctl_hdr);
+               unregister_sysctl_table(rds_ib_sysctl_hdr);
 }
 
-int rds_ib_sysctl_init(void)
+int __init rds_ib_sysctl_init(void)
 {
-       rds_ib_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/ib", rds_ib_sysctl_table);
-       if (!rds_ib_sysctl_hdr)
+       rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
+       if (rds_ib_sysctl_hdr == NULL)
                return -ENOMEM;
        return 0;
 }
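The file above moves from the namespaced register_net_sysctl() back to the 2.6.32-era register_sysctl_paths(): the mount point net.rds.ib is spelled out as a ctl_path array, and every entry carries a binary .ctl_name (CTL_UNNUMBERED for knobs reachable only through /proc/sys). A minimal standalone entry in that style, with hypothetical names, showing how proc_doulongvec_minmax uses extra1/extra2 (written values outside the range are silently ignored):

    static unsigned long demo_val = 16;     /* hypothetical knob */
    static unsigned long demo_min = 1;
    static unsigned long demo_max = 64;

    static ctl_table demo_table[] = {
            {
                    .ctl_name     = CTL_UNNUMBERED, /* no binary-sysctl id */
                    .procname     = "demo",         /* /proc/sys/.../demo */
                    .data         = &demo_val,
                    .maxlen       = sizeof(unsigned long),
                    .mode         = 0644,
                    .proc_handler = &proc_doulongvec_minmax,
                    .extra1       = &demo_min,      /* lower bound */
                    .extra2       = &demo_max,      /* upper bound */
            },
            { .ctl_name = 0 }                       /* terminator */
    };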
diff --git a/net/rds/info.c b/net/rds/info.c
index 9a6b4f66187cf3e5ab533cd01344c9856834ebb7..1d885535214dba2ba7567bed1b005e391fa9d675 100644
@@ -32,9 +32,7 @@
  */
 #include <linux/percpu.h>
 #include <linux/seq_file.h>
-#include <linux/slab.h>
 #include <linux/proc_fs.h>
-#include <linux/export.h>
 
 #include "rds.h"
 
@@ -77,11 +75,10 @@ void rds_info_register_func(int optname, rds_info_func func)
        BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
 
        spin_lock(&rds_info_lock);
-       BUG_ON(rds_info_funcs[offset]);
+       BUG_ON(rds_info_funcs[offset] != NULL);
        rds_info_funcs[offset] = func;
        spin_unlock(&rds_info_lock);
 }
-EXPORT_SYMBOL_GPL(rds_info_register_func);
 
 void rds_info_deregister_func(int optname, rds_info_func func)
 {
@@ -94,7 +91,6 @@ void rds_info_deregister_func(int optname, rds_info_func func)
        rds_info_funcs[offset] = NULL;
        spin_unlock(&rds_info_lock);
 }
-EXPORT_SYMBOL_GPL(rds_info_deregister_func);
 
 /*
  * Typically we hold an atomic kmap across multiple rds_info_copy() calls
@@ -103,8 +99,8 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func);
  */
 void rds_info_iter_unmap(struct rds_info_iterator *iter)
 {
-       if (iter->addr) {
-               kunmap_atomic(iter->addr);
+       if (iter->addr != NULL) {
+               kunmap_atomic(iter->addr, KM_USER0);
                iter->addr = NULL;
        }
 }
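The kmap changes in this file restore the 2.6.32-era two-argument atomic kmap API: the caller names a fixed per-cpu slot (KM_USER0 here, KM_SOFTIRQ0 in the iw_recv.c hunks below) and must pass the same slot to kunmap_atomic(). A sketch of the restored pattern, with names from the copy loop below:

    /* the KM_* slot selects a per-cpu fixmap entry, so two nested
     * atomic kmaps in the same context need two distinct slots */
    void *addr = kmap_atomic(*iter->pages, KM_USER0);
    memcpy(addr + iter->offset, data, this);
    kunmap_atomic(addr, KM_USER0);  /* same slot, LIFO order */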
@@ -118,8 +114,8 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
        unsigned long this;
 
        while (bytes) {
-               if (!iter->addr)
-                       iter->addr = kmap_atomic(*iter->pages);
+               if (iter->addr == NULL)
+                       iter->addr = kmap_atomic(*iter->pages, KM_USER0);
 
                this = min(bytes, PAGE_SIZE - iter->offset);
 
@@ -134,14 +130,13 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
                iter->offset += this;
 
                if (iter->offset == PAGE_SIZE) {
-                       kunmap_atomic(iter->addr);
+                       kunmap_atomic(iter->addr, KM_USER0);
                        iter->addr = NULL;
                        iter->offset = 0;
                        iter->pages++;
                }
        }
 }
-EXPORT_SYMBOL_GPL(rds_info_copy);
 
 /*
  * @optval points to the userspace buffer that the information snapshot
@@ -189,11 +184,14 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
                        >> PAGE_SHIFT;
 
        pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
-       if (!pages) {
+       if (pages == NULL) {
                ret = -ENOMEM;
                goto out;
        }
-       ret = get_user_pages_fast(start, nr_pages, 1, pages);
+       down_read(&current->mm->mmap_sem);
+       ret = get_user_pages(current, current->mm, start, nr_pages, 1, 0,
+                            pages, NULL);
+       up_read(&current->mm->mmap_sem);
        if (ret != nr_pages) {
                if (ret > 0)
                        nr_pages = ret;
@@ -207,7 +205,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
 
 call_func:
        func = rds_info_funcs[optname - RDS_INFO_FIRST];
-       if (!func) {
+       if (func == NULL) {
                ret = -ENOPROTOOPT;
                goto out;
        }
@@ -235,7 +233,7 @@ call_func:
                ret = -EFAULT;
 
 out:
-       for (i = 0; pages && i < nr_pages; i++)
+       for (i = 0; pages != NULL && i < nr_pages; i++)
                put_page(pages[i]);
        kfree(pages);
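Similarly, the pinning hunk above trades get_user_pages_fast() for the long-form get_user_pages(), which makes the caller take mmap_sem around the walk. The restored pattern in isolation:

    down_read(&current->mm->mmap_sem);
    ret = get_user_pages(current, current->mm, start, nr_pages,
                         1 /* write */, 0 /* force */, pages, NULL);
    up_read(&current->mm->mmap_sem);
    /* ret is the number of pages actually pinned; on a short pin the
     * caller trims nr_pages and still put_page()s each one at out:. */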
 
diff --git a/net/rds/iw.c b/net/rds/iw.c
index 589935661d667d81b2f6159eb69c237f95329a63..b732efb5b6345b0e5e923388258b500c4eb04208 100644
@@ -37,8 +37,6 @@
 #include <linux/inetdevice.h>
 #include <linux/if_arp.h>
 #include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/module.h>
 
 #include "rds.h"
 #include "iw.h"
@@ -57,7 +55,7 @@ struct list_head rds_iw_devices;
 DEFINE_SPINLOCK(iw_nodev_conns_lock);
 LIST_HEAD(iw_nodev_conns);
 
-static void rds_iw_add_one(struct ib_device *device)
+void rds_iw_add_one(struct ib_device *device)
 {
        struct rds_iw_device *rds_iwdev;
        struct ib_device_attr *dev_attr;
@@ -85,16 +83,23 @@ static void rds_iw_add_one(struct ib_device *device)
        rds_iwdev->max_wrs = dev_attr->max_qp_wr;
        rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
 
+       rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1);
+
        rds_iwdev->dev = device;
        rds_iwdev->pd = ib_alloc_pd(device);
        if (IS_ERR(rds_iwdev->pd))
                goto free_dev;
 
        if (!rds_iwdev->dma_local_lkey) {
-               rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
-                                       IB_ACCESS_REMOTE_READ |
-                                       IB_ACCESS_REMOTE_WRITE |
-                                       IB_ACCESS_LOCAL_WRITE);
+               if (device->node_type != RDMA_NODE_RNIC) {
+                       rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+                                               IB_ACCESS_LOCAL_WRITE);
+               } else {
+                       rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+                                               IB_ACCESS_REMOTE_READ |
+                                               IB_ACCESS_REMOTE_WRITE |
+                                               IB_ACCESS_LOCAL_WRITE);
+               }
                if (IS_ERR(rds_iwdev->mr))
                        goto err_pd;
        } else
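A note on the page_shift line added back above: dev_attr->page_size_cap is a bitmask of the DMA page sizes the device supports, so ffs(...) - 1 is the smallest supported shift, and max() keeps the mapping at least as coarse as the CPU page size. A worked example under that reading:

    /* e.g. page_size_cap = 0x000ff000, i.e. 4K through 512K pages:
     * ffs(0x000ff000) = 13 (1-based index of the lowest set bit),
     * so ffs(...) - 1 = 12; on a 4K-page CPU, max(12, 12) = 12. */
    rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1);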
@@ -125,7 +130,7 @@ free_attr:
        kfree(dev_attr);
 }
 
-static void rds_iw_remove_one(struct ib_device *device)
+void rds_iw_remove_one(struct ib_device *device)
 {
        struct rds_iw_device *rds_iwdev;
        struct rds_iw_cm_id *i_cm_id, *next;
@@ -186,8 +191,8 @@ static int rds_iw_conn_info_visitor(struct rds_connection *conn,
                ic = conn->c_transport_data;
                dev_addr = &ic->i_cm_id->route.addr.dev_addr;
 
-               rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
-               rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+               ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+               ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
 
                rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
                iinfo->max_send_wr = ic->i_send_ring.w_nr;
@@ -227,9 +232,9 @@ static int rds_iw_laddr_check(__be32 addr)
        /* Create a CMA ID and try to bind it. This catches both
         * IB and iWARP capable NICs.
         */
-       cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
-       if (IS_ERR(cm_id))
-               return PTR_ERR(cm_id);
+       cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
+       if (!cm_id)
+               return -EADDRNOTAVAIL;
 
        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
@@ -239,8 +244,7 @@ static int rds_iw_laddr_check(__be32 addr)
        ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
        /* due to this, we will claim to support IB devices unless we
           check node_type. */
-       if (ret || !cm_id->device ||
-           cm_id->device->node_type != RDMA_NODE_RNIC)
+       if (ret || cm_id->device->node_type != RDMA_NODE_RNIC)
                ret = -EADDRNOTAVAIL;
 
        rdsdebug("addr %pI4 ret %d node type %d\n",
@@ -266,6 +270,7 @@ struct rds_transport rds_iw_transport = {
        .laddr_check            = rds_iw_laddr_check,
        .xmit_complete          = rds_iw_xmit_complete,
        .xmit                   = rds_iw_xmit,
+       .xmit_cong_map          = NULL,
        .xmit_rdma              = rds_iw_xmit_rdma,
        .recv                   = rds_iw_recv,
        .conn_alloc             = rds_iw_conn_alloc,
@@ -273,6 +278,7 @@ struct rds_transport rds_iw_transport = {
        .conn_connect           = rds_iw_conn_connect,
        .conn_shutdown          = rds_iw_conn_shutdown,
        .inc_copy_to_user       = rds_iw_inc_copy_to_user,
+       .inc_purge              = rds_iw_inc_purge,
        .inc_free               = rds_iw_inc_free,
        .cm_initiate_connect    = rds_iw_cm_initiate_connect,
        .cm_handle_connect      = rds_iw_cm_handle_connect,
@@ -285,11 +291,10 @@ struct rds_transport rds_iw_transport = {
        .flush_mrs              = rds_iw_flush_mrs,
        .t_owner                = THIS_MODULE,
        .t_name                 = "iwarp",
-       .t_type                 = RDS_TRANS_IWARP,
        .t_prefer_loopback      = 1,
 };
 
-int rds_iw_init(void)
+int __init rds_iw_init(void)
 {
        int ret;
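The rds_iw_laddr_check() hunks above restore the old probing trick: build a CMA ID with the three-argument rdma_create_id() of that era, bind it to the candidate address with port 0, and accept the address only if the device it landed on is an iWARP RNIC. In outline:

    struct sockaddr_in sin = {
            .sin_family      = AF_INET,
            .sin_addr.s_addr = addr,    /* port 0: bind only */
    };
    struct rdma_cm_id *cm_id;
    int ret;

    cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
    if (!cm_id)
            return -EADDRNOTAVAIL;

    ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
    /* bound, but not to an iWARP NIC: reject, per the comment above */
    if (ret || cm_id->device->node_type != RDMA_NODE_RNIC)
            ret = -EADDRNOTAVAIL;
    rdma_destroy_id(cm_id);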
 
diff --git a/net/rds/iw.h b/net/rds/iw.h
index cbe6674e31ee52f6c9fd4380e3f0942127ede7c9..b4fb27252895adb45a1cf635ee441741b392cc5d 100644
@@ -1,7 +1,6 @@
 #ifndef _RDS_IW_H
 #define _RDS_IW_H
 
-#include <linux/interrupt.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
 #include "rds.h"
@@ -71,7 +70,7 @@ struct rds_iw_send_work {
        struct rds_message      *s_rm;
 
        /* We should really put these into a union: */
-       struct rm_rdma_op       *s_op;
+       struct rds_rdma_op      *s_op;
        struct rds_iw_mapping   *s_mapping;
        struct ib_mr            *s_mr;
        struct ib_fast_reg_page_list *s_page_list;
@@ -120,7 +119,6 @@ struct rds_iw_connection {
        struct rds_iw_send_work *i_sends;
 
        /* rx */
-       struct tasklet_struct   i_recv_tasklet;
        struct mutex            i_recv_mutex;
        struct rds_iw_work_ring i_recv_ring;
        struct rds_iw_incoming  *i_iwinc;
@@ -183,6 +181,7 @@ struct rds_iw_device {
        struct ib_pd            *pd;
        struct ib_mr            *mr;
        struct rds_iw_mr_pool   *mr_pool;
+       int                     page_shift;
        int                     max_sge;
        unsigned int            max_wrs;
        unsigned int            dma_local_lkey:1;
@@ -269,6 +268,8 @@ static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
 
 /* ib.c */
 extern struct rds_transport rds_iw_transport;
+extern void rds_iw_add_one(struct ib_device *device);
+extern void rds_iw_remove_one(struct ib_device *device);
 extern struct ib_client rds_iw_client;
 
 extern unsigned int fastreg_pool_size;
@@ -283,7 +284,7 @@ void rds_iw_conn_free(void *arg);
 int rds_iw_conn_connect(struct rds_connection *conn);
 void rds_iw_conn_shutdown(struct rds_connection *conn);
 void rds_iw_state_change(struct sock *sk);
-int rds_iw_listen_init(void);
+int __init rds_iw_listen_init(void);
 void rds_iw_listen_stop(void);
 void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
 int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -317,17 +318,19 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
 void rds_iw_sync_mr(void *trans_private, int dir);
 void rds_iw_free_mr(void *trans_private, int invalidate);
 void rds_iw_flush_mrs(void);
+void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
 
 /* ib_recv.c */
-int rds_iw_recv_init(void);
+int __init rds_iw_recv_init(void);
 void rds_iw_recv_exit(void);
 int rds_iw_recv(struct rds_connection *conn);
 int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
                       gfp_t page_gfp, int prefill);
+void rds_iw_inc_purge(struct rds_incoming *inc);
 void rds_iw_inc_free(struct rds_incoming *inc);
-int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+                            size_t size);
 void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
-void rds_iw_recv_tasklet_fn(unsigned long data);
 void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
 void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
 void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
@@ -354,11 +357,11 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
 void rds_iw_send_init_ring(struct rds_iw_connection *ic);
 void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
 void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
-                            u32 *adv_credits, int need_posted, int max_posted);
+                            u32 *adv_credits, int need_posted);
 
 /* ib_stats.c */
 DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
@@ -367,7 +370,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
                                    unsigned int avail);
 
 /* ib_sysctl.c */
-int rds_iw_sysctl_init(void);
+int __init rds_iw_sysctl_init(void);
 void rds_iw_sysctl_exit(void);
 extern unsigned long rds_iw_sysctl_max_send_wr;
 extern unsigned long rds_iw_sysctl_max_recv_wr;
@@ -375,6 +378,7 @@ extern unsigned long rds_iw_sysctl_max_unsig_wrs;
 extern unsigned long rds_iw_sysctl_max_unsig_bytes;
 extern unsigned long rds_iw_sysctl_max_recv_allocation;
 extern unsigned int rds_iw_sysctl_flow_control;
+extern ctl_table rds_iw_sysctl_table[];
 
 /*
  * Helper functions for getting/setting the header and data SGEs in
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
index a6c2bea9f8f9b37b46ce381336a90fb685187083..a416b0d492b1ce7b08d0cc17158e379cc6f609f8 100644
@@ -32,9 +32,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/in.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/ratelimit.h>
 
 #include "rds.h"
 #include "iw.h"
@@ -158,11 +156,9 @@ static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
        case IB_EVENT_QP_REQ_ERR:
        case IB_EVENT_QP_FATAL:
        default:
-               rdsdebug("Fatal QP Event %u "
-                       "- connection %pI4->%pI4, reconnecting\n",
+               rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n",
                        event->event, &conn->c_laddr,
                        &conn->c_faddr);
-               rds_conn_drop(conn);
                break;
        }
 }
@@ -182,7 +178,7 @@ static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
        unsigned int send_size, recv_size;
        int ret;
 
-       /* The offset of 1 is to accommodate the additional ACK WR. */
+       /* The offset of 1 is to accomodate the additional ACK WR. */
        send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
        recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
        rds_iw_ring_resize(send_ring, send_size - 1);
@@ -258,8 +254,9 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
         * the rds_iwdev at all.
         */
        rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
-       if (!rds_iwdev) {
-               printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
+       if (rds_iwdev == NULL) {
+               if (printk_ratelimit())
+                       printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
                                        dev->name);
                return -EOPNOTSUPP;
        }
@@ -292,7 +289,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
                                           ic->i_send_ring.w_nr *
                                                sizeof(struct rds_header),
                                           &ic->i_send_hdrs_dma, GFP_KERNEL);
-       if (!ic->i_send_hdrs) {
+       if (ic->i_send_hdrs == NULL) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent send failed\n");
                goto out;
@@ -302,7 +299,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
                                           ic->i_recv_ring.w_nr *
                                                sizeof(struct rds_header),
                                           &ic->i_recv_hdrs_dma, GFP_KERNEL);
-       if (!ic->i_recv_hdrs) {
+       if (ic->i_recv_hdrs == NULL) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent recv failed\n");
                goto out;
@@ -310,14 +307,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 
        ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
                                       &ic->i_ack_dma, GFP_KERNEL);
-       if (!ic->i_ack) {
+       if (ic->i_ack == NULL) {
                ret = -ENOMEM;
                rdsdebug("ib_dma_alloc_coherent ack failed\n");
                goto out;
        }
 
        ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
-       if (!ic->i_sends) {
+       if (ic->i_sends == NULL) {
                ret = -ENOMEM;
                rdsdebug("send allocation failed\n");
                goto out;
@@ -325,7 +322,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
        rds_iw_send_init_ring(ic);
 
        ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
-       if (!ic->i_recvs) {
+       if (ic->i_recvs == NULL) {
                ret = -ENOMEM;
                rdsdebug("recv allocation failed\n");
                goto out;
@@ -365,12 +362,13 @@ static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
                version = RDS_PROTOCOL_3_0;
                while ((common >>= 1) != 0)
                        version++;
-       }
-       printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using "
+       } else if (printk_ratelimit()) {
+               printk(KERN_NOTICE "RDS: Connection from %pI4 using "
                        "incompatible protocol version %u.%u\n",
                        &dp->dp_saddr,
                        dp->dp_protocol_major,
                        dp->dp_protocol_minor);
+       }
        return version;
 }
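The negotiation loop above converts the highest version bit both peers share into a protocol number; a worked example, assuming common is the AND of the two sides' minor-version bitmasks (bit n set meaning minor n is supported), as in the matching ib_cm.c code:

    /* local minors {0,1,2} -> 0x0007, peer minors {1,2,3} -> 0x000e */
    u16 common  = 0x0007 & 0x000e;          /* = 0x0006 */
    u32 version = RDS_PROTOCOL_3_0;         /* 0x0300 */

    while ((common >>= 1) != 0)
            version++;
    /* two shifts leave a nonzero value, so version = 0x0302:
     * protocol 3.2, the highest minor both sides support. */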
 
@@ -451,7 +449,6 @@ int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
        err = rds_iw_setup_qp(conn);
        if (err) {
                rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
-               mutex_unlock(&conn->c_cm_lock);
                goto out;
        }
 
@@ -521,7 +518,7 @@ int rds_iw_conn_connect(struct rds_connection *conn)
        /* XXX I wonder what affect the port space has */
        /* delegate cm event handler to rdma_transport */
        ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
-                                    RDMA_PS_TCP, IB_QPT_RC);
+                                    RDMA_PS_TCP);
        if (IS_ERR(ic->i_cm_id)) {
                ret = PTR_ERR(ic->i_cm_id);
                ic->i_cm_id = NULL;
@@ -590,8 +587,8 @@ void rds_iw_conn_shutdown(struct rds_connection *conn)
                        /* Actually this may happen quite frequently, when
                         * an outgoing connect raced with an incoming connect.
                         */
-                       rdsdebug("failed to disconnect, cm: %p err %d\n",
-                                ic->i_cm_id, err);
+                       rdsdebug("rds_iw_conn_shutdown: failed to disconnect,"
+                                  " cm: %p err %d\n", ic->i_cm_id, err);
                }
 
                if (ic->i_cm_id->qp) {
@@ -694,13 +691,11 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
        unsigned long flags;
 
        /* XXX too lazy? */
-       ic = kzalloc(sizeof(struct rds_iw_connection), gfp);
-       if (!ic)
+       ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
+       if (ic == NULL)
                return -ENOMEM;
 
        INIT_LIST_HEAD(&ic->iw_node);
-       tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn,
-                    (unsigned long) ic);
        mutex_init(&ic->i_recv_mutex);
 #ifndef KERNEL_HAS_ATOMIC64
        spin_lock_init(&ic->i_ack_lock);
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index dba8d0864f18046ee87a168d49cc159518fa2916..dcdb37da80f29bd741c0295b2c0a41577b7532d3 100644
  *
  */
 #include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/ratelimit.h>
 
 #include "rds.h"
+#include "rdma.h"
 #include "iw.h"
 
 
@@ -84,13 +83,10 @@ static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
 static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
 static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
                        struct list_head *unmap_list,
-                       struct list_head *kill_list,
-                       int *unpinned);
+                       struct list_head *kill_list);
 static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
 
-static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst,
-                            struct rds_iw_device **rds_iwdev,
-                            struct rdma_cm_id **cm_id)
+static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
 {
        struct rds_iw_device *iwdev;
        struct rds_iw_cm_id *i_cm_id;
@@ -114,23 +110,23 @@ static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst,
                                src_addr->sin_port,
                                dst_addr->sin_addr.s_addr,
                                dst_addr->sin_port,
-                               src->sin_addr.s_addr,
-                               src->sin_port,
-                               dst->sin_addr.s_addr,
-                               dst->sin_port);
+                               rs->rs_bound_addr,
+                               rs->rs_bound_port,
+                               rs->rs_conn_addr,
+                               rs->rs_conn_port);
 #ifdef WORKING_TUPLE_DETECTION
-                       if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr &&
-                           src_addr->sin_port == src->sin_port &&
-                           dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr &&
-                           dst_addr->sin_port == dst->sin_port) {
+                       if (src_addr->sin_addr.s_addr == rs->rs_bound_addr &&
+                           src_addr->sin_port == rs->rs_bound_port &&
+                           dst_addr->sin_addr.s_addr == rs->rs_conn_addr &&
+                           dst_addr->sin_port == rs->rs_conn_port) {
 #else
                        /* FIXME - needs to compare the local and remote
                         * ipaddr/port tuple, but the ipaddr is the only
-                        * available information in the rds_sock (as the rest are
+                        * available infomation in the rds_sock (as the rest are
                         * zero'ed.  It doesn't appear to be properly populated
                         * during connection setup...
                         */
-                       if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) {
+                       if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) {
 #endif
                                spin_unlock_irq(&iwdev->spinlock);
                                *rds_iwdev = iwdev;
@@ -161,8 +157,7 @@ static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *
        return 0;
 }
 
-static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev,
-                               struct rdma_cm_id *cm_id)
+void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
 {
        struct rds_iw_cm_id *i_cm_id;
 
@@ -182,13 +177,19 @@ int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_i
 {
        struct sockaddr_in *src_addr, *dst_addr;
        struct rds_iw_device *rds_iwdev_old;
+       struct rds_sock rs;
        struct rdma_cm_id *pcm_id;
        int rc;
 
        src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
        dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
 
-       rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id);
+       rs.rs_bound_addr = src_addr->sin_addr.s_addr;
+       rs.rs_bound_port = src_addr->sin_port;
+       rs.rs_conn_addr = dst_addr->sin_addr.s_addr;
+       rs.rs_conn_port = dst_addr->sin_port;
+
+       rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id);
        if (rc)
                rds_iw_remove_cm_id(rds_iwdev, cm_id);
 
@@ -205,9 +206,9 @@ void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *con
        BUG_ON(list_empty(&ic->iw_node));
        list_del(&ic->iw_node);
 
-       spin_lock(&rds_iwdev->spinlock);
+       spin_lock_irq(&rds_iwdev->spinlock);
        list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
-       spin_unlock(&rds_iwdev->spinlock);
+       spin_unlock_irq(&rds_iwdev->spinlock);
        spin_unlock_irq(&iw_nodev_conns_lock);
 
        ic->rds_iwdev = rds_iwdev;
@@ -244,8 +245,11 @@ void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock)
        INIT_LIST_HEAD(list);
        spin_unlock_irq(list_lock);
 
-       list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node)
+       list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
+               if (ic->conn->c_passive)
+                       rds_conn_destroy(ic->conn->c_passive);
                rds_conn_destroy(ic->conn);
+       }
 }
 
 static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
@@ -259,12 +263,18 @@ static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
 }
 
 static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
-                       struct rds_iw_scatterlist *sg)
+                       struct rds_iw_scatterlist *sg,
+                       unsigned int dma_page_shift)
 {
        struct ib_device *dev = rds_iwdev->dev;
        u64 *dma_pages = NULL;
+       u64 dma_mask;
+       unsigned int dma_page_size;
        int i, j, ret;
 
+       dma_page_size = 1 << dma_page_shift;
+       dma_mask = dma_page_size - 1;
+
        WARN_ON(sg->dma_len);
 
        sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
@@ -285,18 +295,18 @@ static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
                sg->bytes += dma_len;
 
                end_addr = dma_addr + dma_len;
-               if (dma_addr & PAGE_MASK) {
+               if (dma_addr & dma_mask) {
                        if (i > 0)
                                goto out_unmap;
-                       dma_addr &= ~PAGE_MASK;
+                       dma_addr &= ~dma_mask;
                }
-               if (end_addr & PAGE_MASK) {
+               if (end_addr & dma_mask) {
                        if (i < sg->dma_len - 1)
                                goto out_unmap;
-                       end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK;
+                       end_addr = (end_addr + dma_mask) & ~dma_mask;
                }
 
-               sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT;
+               sg->dma_npages += (end_addr - dma_addr) >> dma_page_shift;
        }
 
        /* Now gather the dma addrs into one list */
@@ -315,8 +325,8 @@ static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
                u64 end_addr;
 
                end_addr = dma_addr + dma_len;
-               dma_addr &= ~PAGE_MASK;
-               for (; dma_addr < end_addr; dma_addr += PAGE_SIZE)
+               dma_addr &= ~dma_mask;
+               for (; dma_addr < end_addr; dma_addr += dma_page_size)
                        dma_pages[j++] = dma_addr;
                BUG_ON(j > sg->dma_npages);
        }
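The dma_mask arithmetic above enforces the fastreg rule that a page list describes whole DMA pages: only the first entry may start ragged (its start is rounded down) and only the last may end ragged (its end is rounded up); a ragged edge on any interior entry bails out through out_unmap. A worked example with a hypothetical dma_page_shift of 12 (4K pages, dma_mask = 0xfff):

    /* entry 0:        dma_addr = 0x12345800, len = 0x0800
     *   ragged head, but i == 0: start rounds down to 0x12345000,
     *   end = 0x12346000, so (end - start) >> 12 = 1 dma page
     * entry 1 (last): dma_addr = 0x20000000, len = 0x2300
     *   ragged tail, but it is the last entry: end rounds up to
     *   0x20003000, so 3 dma pages
     * dma_npages = 4; the second loop then emits 0x12345000,
     * 0x20000000, 0x20001000, 0x20002000 into dma_pages[]. */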
@@ -473,6 +483,17 @@ void rds_iw_sync_mr(void *trans_private, int direction)
        }
 }
 
+static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all)
+{
+       unsigned int item_count;
+
+       item_count = atomic_read(&pool->item_count);
+       if (free_all)
+               return item_count;
+
+       return 0;
+}
+
 /*
  * Flush our pool of MRs.
  * At a minimum, all currently unused MRs are unmapped.
@@ -485,7 +506,7 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
        LIST_HEAD(unmap_list);
        LIST_HEAD(kill_list);
        unsigned long flags;
-       unsigned int nfreed = 0, ncleaned = 0, unpinned = 0;
+       unsigned int nfreed = 0, ncleaned = 0, free_goal;
        int ret = 0;
 
        rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
@@ -499,6 +520,8 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
                list_splice_init(&pool->clean_list, &kill_list);
        spin_unlock_irqrestore(&pool->list_lock, flags);
 
+       free_goal = rds_iw_flush_goal(pool, free_all);
+
        /* Batched invalidate of dirty MRs.
         * For FMR based MRs, the mappings on the unmap list are
         * actually members of an ibmr (ibmr->mapping). They either
@@ -508,8 +531,7 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
         * will be destroyed by the unmap function.
         */
        if (!list_empty(&unmap_list)) {
-               ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list,
-                                                    &kill_list, &unpinned);
+               ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list);
                /* If we've been asked to destroy all MRs, move those
                 * that were simply cleaned to the kill list */
                if (free_all)
@@ -533,7 +555,6 @@ static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
                spin_unlock_irqrestore(&pool->list_lock, flags);
        }
 
-       atomic_sub(unpinned, &pool->free_pinned);
        atomic_sub(ncleaned, &pool->dirty_count);
        atomic_sub(nfreed, &pool->item_count);
 
@@ -561,8 +582,8 @@ void rds_iw_free_mr(void *trans_private, int invalidate)
        rds_iw_free_fastreg(pool, ibmr);
 
        /* If we've pinned too many pages, request a flush */
-       if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
-           atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+       if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
+        || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
                queue_work(rds_wq, &pool->flush_worker);
 
        if (invalidate) {
@@ -594,17 +615,9 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
        struct rds_iw_device *rds_iwdev;
        struct rds_iw_mr *ibmr = NULL;
        struct rdma_cm_id *cm_id;
-       struct sockaddr_in src = {
-               .sin_addr.s_addr = rs->rs_bound_addr,
-               .sin_port = rs->rs_bound_port,
-       };
-       struct sockaddr_in dst = {
-               .sin_addr.s_addr = rs->rs_conn_addr,
-               .sin_port = rs->rs_conn_port,
-       };
        int ret;
 
-       ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id);
+       ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id);
        if (ret || !cm_id) {
                ret = -ENODEV;
                goto out;
@@ -714,7 +727,7 @@ static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
        f_wr.wr.fast_reg.rkey = mapping->m_rkey;
        f_wr.wr.fast_reg.page_list = ibmr->page_list;
        f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
-       f_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+       f_wr.wr.fast_reg.page_shift = ibmr->device->page_shift;
        f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
                                IB_ACCESS_REMOTE_READ |
                                IB_ACCESS_REMOTE_WRITE;
@@ -724,8 +737,8 @@ static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
        failed_wr = &f_wr;
        ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
        BUG_ON(failed_wr != &f_wr);
-       if (ret)
-               printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+       if (ret && printk_ratelimit())
+               printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
                        __func__, __LINE__, ret);
        return ret;
 }
@@ -746,8 +759,8 @@ static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr)
 
        failed_wr = &s_wr;
        ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
-       if (ret) {
-               printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+       if (ret && printk_ratelimit()) {
+               printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
                        __func__, __LINE__, ret);
                goto out;
        }
@@ -767,7 +780,9 @@ static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
 
        rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
 
-       dma_pages = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg);
+       dma_pages = rds_iw_map_scatterlist(rds_iwdev,
+                               &mapping->m_sg,
+                               rds_iwdev->page_shift);
        if (IS_ERR(dma_pages)) {
                ret = PTR_ERR(dma_pages);
                dma_pages = NULL;
@@ -822,8 +837,7 @@ static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
 
 static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
                                struct list_head *unmap_list,
-                               struct list_head *kill_list,
-                               int *unpinned)
+                               struct list_head *kill_list)
 {
        struct rds_iw_mapping *mapping, *next;
        unsigned int ncleaned = 0;
@@ -850,7 +864,6 @@ static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
 
                spin_lock_irqsave(&pool->list_lock, flags);
                list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
-                       *unpinned += mapping->m_sg.len;
                        list_move(&mapping->m_list, &laundered);
                        ncleaned++;
                }
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
index a66d1794b2d0472e511a179ae9872c2766fb8dd8..fde470fa50d5457c72226d0336e00aeed8df534d 100644
@@ -31,7 +31,6 @@
  *
  */
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <rdma/rdma_cm.h>
@@ -53,7 +52,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
 static void rds_iw_frag_free(struct rds_page_frag *frag)
 {
        rdsdebug("frag %p page %p\n", frag, frag->f_page);
-       BUG_ON(frag->f_page);
+       BUG_ON(frag->f_page != NULL);
        kmem_cache_free(rds_iw_frag_slab, frag);
 }
 
@@ -143,32 +142,31 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
        struct ib_sge *sge;
        int ret = -ENOMEM;
 
-       if (!recv->r_iwinc) {
-               if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
+       if (recv->r_iwinc == NULL) {
+               if (atomic_read(&rds_iw_allocation) >= rds_iw_sysctl_max_recv_allocation) {
                        rds_iw_stats_inc(s_iw_rx_alloc_limit);
                        goto out;
                }
                recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
                                                 kptr_gfp);
-               if (!recv->r_iwinc) {
-                       atomic_dec(&rds_iw_allocation);
+               if (recv->r_iwinc == NULL)
                        goto out;
-               }
+               atomic_inc(&rds_iw_allocation);
                INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
                rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
        }
 
-       if (!recv->r_frag) {
+       if (recv->r_frag == NULL) {
                recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
-               if (!recv->r_frag)
+               if (recv->r_frag == NULL)
                        goto out;
                INIT_LIST_HEAD(&recv->r_frag->f_item);
                recv->r_frag->f_page = NULL;
        }
 
-       if (!ic->i_frag.f_page) {
+       if (ic->i_frag.f_page == NULL) {
                ic->i_frag.f_page = alloc_page(page_gfp);
-               if (!ic->i_frag.f_page)
+               if (ic->i_frag.f_page == NULL)
                        goto out;
                ic->i_frag.f_offset = 0;
        }
@@ -231,8 +229,8 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
        int ret = 0;
        u32 pos;
 
-       while ((prefill || rds_conn_up(conn)) &&
-              rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+       while ((prefill || rds_conn_up(conn))
+                       && rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
                if (pos >= ic->i_recv_ring.w_nr) {
                        printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
                                        pos);
@@ -273,7 +271,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
        return ret;
 }
 
-static void rds_iw_inc_purge(struct rds_incoming *inc)
+void rds_iw_inc_purge(struct rds_incoming *inc)
 {
        struct rds_iw_incoming *iwinc;
        struct rds_page_frag *frag;
@@ -303,12 +301,15 @@ void rds_iw_inc_free(struct rds_incoming *inc)
        BUG_ON(atomic_read(&rds_iw_allocation) < 0);
 }
 
-int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+                           size_t size)
 {
        struct rds_iw_incoming *iwinc;
        struct rds_page_frag *frag;
+       struct iovec *iov = first_iov;
        unsigned long to_copy;
        unsigned long frag_off = 0;
+       unsigned long iov_off = 0;
        int copied = 0;
        int ret;
        u32 len;
@@ -317,25 +318,37 @@ int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
        frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
        len = be32_to_cpu(inc->i_hdr.h_len);
 
-       while (iov_iter_count(to) && copied < len) {
+       while (copied < size && copied < len) {
                if (frag_off == RDS_FRAG_SIZE) {
                        frag = list_entry(frag->f_item.next,
                                          struct rds_page_frag, f_item);
                        frag_off = 0;
                }
-               to_copy = min_t(unsigned long, iov_iter_count(to),
-                               RDS_FRAG_SIZE - frag_off);
+               while (iov_off == iov->iov_len) {
+                       iov_off = 0;
+                       iov++;
+               }
+
+               to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
+               to_copy = min_t(size_t, to_copy, size - copied);
                to_copy = min_t(unsigned long, to_copy, len - copied);
 
+               rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
+                        "[%p, %lu] + %lu\n",
+                        to_copy, iov->iov_base, iov->iov_len, iov_off,
+                        frag->f_page, frag->f_offset, frag_off);
+
                /* XXX needs + offset for multiple recvs per page */
-               rds_stats_add(s_copy_to_user, to_copy);
-               ret = copy_page_to_iter(frag->f_page,
-                                       frag->f_offset + frag_off,
-                                       to_copy,
-                                       to);
-               if (ret != to_copy)
-                       return -EFAULT;
+               ret = rds_page_copy_to_user(frag->f_page,
+                                           frag->f_offset + frag_off,
+                                           iov->iov_base + iov_off,
+                                           to_copy);
+               if (ret) {
+                       copied = ret;
+                       break;
+               }
 
+               iov_off += to_copy;
                frag_off += to_copy;
                copied += to_copy;
        }
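The restored copy loop above predates iov_iter and advances two cursors by hand: (frag, frag_off) across the chain of RDS_FRAG_SIZE receive fragments and (iov, iov_off) across the caller's iovec. Each pass copies the minimum of four remaining-byte counts, so whichever resource runs out first simply rolls over on the next iteration:

    to_copy = min(iov->iov_len - iov_off,    /* room left in this iovec */
                  RDS_FRAG_SIZE - frag_off); /* bytes left in this frag */
    to_copy = min_t(size_t, to_copy, size - copied);        /* buffer  */
    to_copy = min_t(unsigned long, to_copy, len - copied);  /* message */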
@@ -414,7 +427,7 @@ static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
 {
        atomic64_set(&ic->i_ack_next, seq);
        if (ack_required) {
-               smp_mb__before_atomic();
+               smp_mb__before_clear_bit();
                set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
        }
 }
@@ -422,7 +435,7 @@ static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
 static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
 {
        clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-       smp_mb__after_atomic();
+       smp_mb__after_clear_bit();
 
        return atomic64_read(&ic->i_ack_next);
 }
@@ -454,8 +467,8 @@ static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credi
                set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
 
                rds_iw_stats_inc(s_iw_ack_send_failure);
-
-               rds_iw_conn_error(ic->conn, "sending ack failed\n");
+               /* Need to finesse this later. */
+               BUG();
        } else
                rds_iw_stats_inc(s_iw_ack_sent);
 }
@@ -511,7 +524,7 @@ void rds_iw_attempt_ack(struct rds_iw_connection *ic)
        }
 
        /* Can we get a send credit? */
-       if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
+       if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) {
                rds_iw_stats_inc(s_iw_tx_throttle);
                clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
                return;
@@ -583,7 +596,7 @@ static void rds_iw_cong_recv(struct rds_connection *conn,
                to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
                BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
 
-               addr = kmap_atomic(frag->f_page);
+               addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
 
                src = addr + frag_off;
                dst = (void *)map->m_page_addrs[map_page] + map_off;
@@ -593,7 +606,7 @@ static void rds_iw_cong_recv(struct rds_connection *conn,
                        uncongested |= ~(*src) & *dst;
                        *dst++ = *src++;
                }
-               kunmap_atomic(addr);
+               kunmap_atomic(addr, KM_SOFTIRQ0);
 
                copied += to_copy;
 
@@ -646,7 +659,7 @@ static void rds_iw_process_recv(struct rds_connection *conn,
 
        if (byte_len < sizeof(struct rds_header)) {
                rds_iw_conn_error(conn, "incoming message "
-                      "from %pI4 didn't include a "
+                      "from %pI4 didn't inclue a "
                       "header, disconnecting and "
                       "reconnecting\n",
                       &conn->c_faddr);
@@ -701,7 +714,7 @@ static void rds_iw_process_recv(struct rds_connection *conn,
         * into the inc and save the inc so we can hang upcoming fragments
         * off its list.
         */
-       if (!iwinc) {
+       if (iwinc == NULL) {
                iwinc = recv->r_iwinc;
                recv->r_iwinc = NULL;
                ic->i_iwinc = iwinc;
@@ -716,10 +729,10 @@ static void rds_iw_process_recv(struct rds_connection *conn,
                hdr = &iwinc->ii_inc.i_hdr;
                /* We can't just use memcmp here; fragments of a
                 * single message may carry different ACKs */
-               if (hdr->h_sequence != ihdr->h_sequence ||
-                   hdr->h_len != ihdr->h_len ||
-                   hdr->h_sport != ihdr->h_sport ||
-                   hdr->h_dport != ihdr->h_dport) {
+               if (hdr->h_sequence != ihdr->h_sequence
+                || hdr->h_len != ihdr->h_len
+                || hdr->h_sport != ihdr->h_sport
+                || hdr->h_dport != ihdr->h_dport) {
                        rds_iw_conn_error(conn,
                                "fragment header mismatch; forcing reconnect\n");
                        return;
@@ -739,7 +752,8 @@ static void rds_iw_process_recv(struct rds_connection *conn,
                        rds_iw_cong_recv(conn, iwinc);
                else {
                        rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
-                                         &iwinc->ii_inc, GFP_ATOMIC);
+                                         &iwinc->ii_inc, GFP_ATOMIC,
+                                         KM_SOFTIRQ0);
                        state->ack_next = be64_to_cpu(hdr->h_sequence);
                        state->ack_next_valid = 1;
                }
@@ -769,22 +783,17 @@ void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context)
 {
        struct rds_connection *conn = context;
        struct rds_iw_connection *ic = conn->c_transport_data;
+       struct ib_wc wc;
+       struct rds_iw_ack_state state = { 0, };
+       struct rds_iw_recv_work *recv;
 
        rdsdebug("conn %p cq %p\n", conn, cq);
 
        rds_iw_stats_inc(s_iw_rx_cq_call);
 
-       tasklet_schedule(&ic->i_recv_tasklet);
-}
-
-static inline void rds_poll_cq(struct rds_iw_connection *ic,
-                              struct rds_iw_ack_state *state)
-{
-       struct rds_connection *conn = ic->conn;
-       struct ib_wc wc;
-       struct rds_iw_recv_work *recv;
+       ib_req_notify_cq(cq, IB_CQ_SOLICITED);
 
-       while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
+       while (ib_poll_cq(cq, 1, &wc) > 0) {
                rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
                         (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
                         be32_to_cpu(wc.ex.imm_data));
@@ -802,7 +811,7 @@ static inline void rds_poll_cq(struct rds_iw_connection *ic,
                if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
                        /* We expect errors as the qp is drained during shutdown */
                        if (wc.status == IB_WC_SUCCESS) {
-                               rds_iw_process_recv(conn, recv, wc.byte_len, state);
+                               rds_iw_process_recv(conn, recv, wc.byte_len, &state);
                        } else {
                                rds_iw_conn_error(conn, "recv completion on "
                                       "%pI4 had status %u, disconnecting and "
@@ -813,17 +822,6 @@ static inline void rds_poll_cq(struct rds_iw_connection *ic,
 
                rds_iw_ring_free(&ic->i_recv_ring, 1);
        }
-}
-
-void rds_iw_recv_tasklet_fn(unsigned long data)
-{
-       struct rds_iw_connection *ic = (struct rds_iw_connection *) data;
-       struct rds_connection *conn = ic->conn;
-       struct rds_iw_ack_state state = { 0, };
-
-       rds_poll_cq(ic, &state);
-       ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
-       rds_poll_cq(ic, &state);
 
        if (state.ack_next_valid)
                rds_iw_set_ack(ic, state.ack_next, state.ack_required);
@@ -871,7 +869,7 @@ int rds_iw_recv(struct rds_connection *conn)
        return ret;
 }
 
-int rds_iw_recv_init(void)
+int __init rds_iw_recv_init(void)
 {
        struct sysinfo si;
        int ret = -ENOMEM;
@@ -883,13 +881,13 @@ int rds_iw_recv_init(void)
        rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
                                        sizeof(struct rds_iw_incoming),
                                        0, 0, NULL);
-       if (!rds_iw_incoming_slab)
+       if (rds_iw_incoming_slab == NULL)
                goto out;
 
        rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
                                        sizeof(struct rds_page_frag),
                                        0, 0, NULL);
-       if (!rds_iw_frag_slab)
+       if (rds_iw_frag_slab == NULL)
                kmem_cache_destroy(rds_iw_incoming_slab);
        else
                ret = 0;
diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c
index da8e3b63f66363c67a73700ba356179bbf2fbfc7..d422d4b5deef2b49c183d6030db61d09a98a8729 100644
@@ -137,7 +137,7 @@ int rds_iw_ring_empty(struct rds_iw_work_ring *ring)
 
 int rds_iw_ring_low(struct rds_iw_work_ring *ring)
 {
-       return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1);
+       return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2);
 }
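Quick arithmetic on the one-liner above: for a ring of w_nr = 256 entries, the restored test treats the ring as low while at most 256 >> 2 = 64 entries (a quarter) are in use; the reverted upstream test used 256 >> 1 = 128, i.e. already reported low at half occupancy:

    /* restored threshold: low while occupancy <= one quarter */
    return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2);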
 
 
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 13834780a3089e9e640e470f7b2e8b26c6334b7b..22dd38ffd6080843afd8ddc3987ff2a4490369eb 100644
@@ -34,9 +34,9 @@
 #include <linux/in.h>
 #include <linux/device.h>
 #include <linux/dmapool.h>
-#include <linux/ratelimit.h>
 
 #include "rds.h"
+#include "rdma.h"
 #include "iw.h"
 
 static void rds_iw_send_rdma_complete(struct rds_message *rm,
@@ -64,13 +64,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm,
 }
 
 static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
-                                  struct rm_rdma_op *op)
+                                  struct rds_rdma_op *op)
 {
-       if (op->op_mapped) {
+       if (op->r_mapped) {
                ib_dma_unmap_sg(ic->i_cm_id->device,
-                       op->op_sg, op->op_nents,
-                       op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-               op->op_mapped = 0;
+                       op->r_sg, op->r_nents,
+                       op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+               op->r_mapped = 0;
        }
 }
 
@@ -83,11 +83,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
        rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
 
        ib_dma_unmap_sg(ic->i_cm_id->device,
-                    rm->data.op_sg, rm->data.op_nents,
+                    rm->m_sg, rm->m_nents,
                     DMA_TO_DEVICE);
 
-       if (rm->rdma.op_active) {
-               rds_iw_send_unmap_rdma(ic, &rm->rdma);
+       if (rm->m_rdma_op != NULL) {
+               rds_iw_send_unmap_rdma(ic, rm->m_rdma_op);
 
                /* If the user asked for a completion notification on this
                 * message, we can implement three different semantics:
@@ -111,10 +111,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
                 */
                rds_iw_send_rdma_complete(rm, wc_status);
 
-               if (rm->rdma.op_write)
-                       rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
+               if (rm->m_rdma_op->r_write)
+                       rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
                else
-                       rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
+                       rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
        }
 
        /* If anyone waited for this message to get flushed out, wake
@@ -232,7 +232,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
                }
 
                if (wc.wr_id == RDS_IW_ACK_WR_ID) {
-                       if (time_after(jiffies, ic->i_ack_queued + HZ/2))
+                       if (ic->i_ack_queued + HZ/2 < jiffies)
                                rds_iw_stats_inc(s_iw_tx_stalled);
                        rds_iw_ack_send_complete(ic);
                        continue;
@@ -259,7 +259,8 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
                                 * when the SEND completes. */
                                break;
                        default:
-                               printk_ratelimited(KERN_NOTICE
+                               if (printk_ratelimit())
+                                       printk(KERN_NOTICE
                                                "RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
                                                __func__, send->s_wr.opcode);
                                break;
@@ -267,7 +268,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
 
                        send->s_wr.opcode = 0xdead;
                        send->s_wr.num_sge = 1;
-                       if (time_after(jiffies, send->s_queued + HZ/2))
+                       if (send->s_queued + HZ/2 < jiffies)
                                rds_iw_stats_inc(s_iw_tx_stalled);
 
                        /* If a RDMA operation produced an error, signal this right
@@ -287,8 +288,8 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
 
                rds_iw_ring_free(&ic->i_send_ring, completed);
 
-               if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
-                   test_bit(0, &conn->c_map_queued))
+               if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
+                || test_bit(0, &conn->c_map_queued))
                        queue_delayed_work(rds_wq, &conn->c_send_w, 0);
 
                /* We expect errors as the qp is drained during shutdown */
@@ -307,7 +308,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
  *
  * Conceptually, we have two counters:
  *  -  send credits: this tells us how many WRs we're allowed
- *     to submit without overrunning the receiver's queue. For
+ *     to submit without overrunning the reciever's queue. For
  *     each SEND WR we post, we decrement this by one.
  *
  *  -  posted credits: this tells us how many WRs we recently
@@ -346,7 +347,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
  * and using atomic_cmpxchg when updating the two counters.
  */
 int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
-                            u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
+                            u32 wanted, u32 *adv_credits, int need_posted)
 {
        unsigned int avail, posted, got = 0, advertise;
        long oldval, newval;
@@ -361,7 +362,7 @@ try_again:
        posted = IB_GET_POST_CREDITS(oldval);
        avail = IB_GET_SEND_CREDITS(oldval);
 
-       rdsdebug("wanted=%u credits=%u posted=%u\n",
+       rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n",
                        wanted, avail, posted);
 
        /* The last credit must be used to send a credit update. */
@@ -386,7 +387,7 @@ try_again:
         * available.
         */
        if (posted && (got || need_posted)) {
-               advertise = min_t(unsigned int, posted, max_posted);
+               advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
                newval -= IB_SET_POST_CREDITS(advertise);
        }
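
rds_iw_send_grab_credits() works on a single atomic that packs both counters, decoded with the IB_GET_SEND_CREDITS()/IB_GET_POST_CREDITS() accessors and updated in an atomic_cmpxchg() retry loop (the try_again label above). A standalone sketch of the same technique; the 16/16 bit split is an assumption, not the verified layout from iw.h:

#include <stdatomic.h>
#include <stdio.h>

#define GET_SEND(v)  ((v) & 0xffff)
#define GET_POST(v)  (((v) >> 16) & 0xffff)
#define SET_SEND(v)  (v)
#define SET_POST(v)  ((v) << 16)

static atomic_uint credits = SET_SEND(8) | SET_POST(3);

static unsigned grab(unsigned wanted)
{
	unsigned old, new, got;

	do {    /* retry if another CPU raced us between load and CAS */
		old = atomic_load(&credits);
		got = GET_SEND(old) < wanted ? GET_SEND(old) : wanted;
		new = old - SET_SEND(got);
	} while (!atomic_compare_exchange_weak(&credits, &old, new));
	return got;
}

int main(void)
{
	printf("got %u, left send=%u post=%u\n", grab(5),
	       GET_SEND(atomic_load(&credits)),
	       GET_POST(atomic_load(&credits)));
	return 0;
}
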
 
@@ -405,7 +406,7 @@ void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits)
        if (credits == 0)
                return;
 
-       rdsdebug("credits=%u current=%u%s\n",
+       rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n",
                        credits,
                        IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
                        test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
@@ -518,7 +519,8 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
        BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
 
        /* Fastreg support */
-       if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) {
+       if (rds_rdma_cookie_key(rm->m_rdma_cookie)
+        && !ic->i_fastreg_posted) {
                ret = -EAGAIN;
                goto out;
        }
@@ -539,7 +541,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 
        credit_alloc = work_alloc;
        if (ic->i_flowctl) {
-               credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
+               credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0);
                adv_credits += posted;
                if (credit_alloc < work_alloc) {
                        rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
@@ -547,7 +549,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
                        flow_controlled++;
                }
                if (work_alloc == 0) {
-                       set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+                       rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
                        rds_iw_stats_inc(s_iw_tx_throttle);
                        ret = -ENOMEM;
                        goto out;
@@ -555,27 +557,25 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
        }
 
        /* map the message the first time we see it */
-       if (!ic->i_rm) {
+       if (ic->i_rm == NULL) {
                /*
                printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
                                be16_to_cpu(rm->m_inc.i_hdr.h_dport),
                                rm->m_inc.i_hdr.h_flags,
                                be32_to_cpu(rm->m_inc.i_hdr.h_len));
                   */
-               if (rm->data.op_nents) {
-                       rm->data.op_count = ib_dma_map_sg(dev,
-                                                         rm->data.op_sg,
-                                                         rm->data.op_nents,
-                                                         DMA_TO_DEVICE);
-                       rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
-                       if (rm->data.op_count == 0) {
+               if (rm->m_nents) {
+                       rm->m_count = ib_dma_map_sg(dev,
+                                        rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
+                       rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
+                       if (rm->m_count == 0) {
                                rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
                                rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
                                ret = -ENOMEM; /* XXX ? */
                                goto out;
                        }
                } else {
-                       rm->data.op_count = 0;
+                       rm->m_count = 0;
                }
 
                ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
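
i_unsignaled_wrs is reloaded from the sysctl whenever a fresh message is mapped; the send loop then counts it down per work request and asks for a signaled completion only when it hits zero, bounding both completion interrupts and the run of unsignaled WRs in flight. A standalone sketch of that countdown (names hypothetical):

#include <stdio.h>

#define IB_SEND_SIGNALED 0x1     /* stand-in for the IB flag */

static unsigned unsignaled_left, max_unsig_wrs = 4;

static unsigned post_flags(void)
{
	if (unsignaled_left == 0 || --unsignaled_left == 0) {
		unsignaled_left = max_unsig_wrs;
		return IB_SEND_SIGNALED;   /* ask for a completion */
	}
	return 0;                          /* completion suppressed */
}

int main(void)
{
	for (int i = 0; i < 10; i++)
		printf("wr %d flags %u\n", i, post_flags());
	return 0;
}
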
@@ -591,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 
                /* If it has a RDMA op, tell the peer we did it. This is
                 * used by the peer to release use-once RDMA MRs. */
-               if (rm->rdma.op_active) {
+               if (rm->m_rdma_op) {
                        struct rds_ext_header_rdma ext_hdr;
 
-                       ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
+                       ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
                        rds_message_add_extension(&rm->m_inc.i_hdr,
                                        RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
                }
@@ -614,15 +614,16 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
                /*
                 * Update adv_credits since we reset the ACK_REQUIRED bit.
                 */
-               rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
+               rds_iw_send_grab_credits(ic, 0, &posted, 1);
                adv_credits += posted;
                BUG_ON(adv_credits > 255);
-       }
+       } else if (ic->i_rm != rm)
+               BUG();
 
        send = &ic->i_sends[pos];
        first = send;
        prev = NULL;
-       scat = &rm->data.op_sg[sg];
+       scat = &rm->m_sg[sg];
        sent = 0;
        i = 0;
 
@@ -632,7 +633,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
         * or when requested by the user. Right now, we let
         * the application choose.
         */
-       if (rm->rdma.op_active && rm->rdma.op_fence)
+       if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
                send_flags = IB_SEND_FENCE;
 
        /*
@@ -651,7 +652,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
        }
 
        /* if there's data reference it with a chain of work reqs */
-       for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
+       for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
                unsigned int len;
 
                send = &ic->i_sends[pos];
@@ -729,7 +730,7 @@ add_header:
                sent += sizeof(struct rds_header);
 
        /* if we finished the message then send completion owns it */
-       if (scat == &rm->data.op_sg[rm->data.op_count]) {
+       if (scat == &rm->m_sg[rm->m_count]) {
                prev->s_rm = ic->i_rm;
                prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
                ic->i_rm = NULL;
@@ -778,14 +779,14 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd
        send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey;
        send->s_wr.wr.fast_reg.page_list = send->s_page_list;
        send->s_wr.wr.fast_reg.page_list_len = nent;
-       send->s_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+       send->s_wr.wr.fast_reg.page_shift = rds_iwdev->page_shift;
        send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
        send->s_wr.wr.fast_reg.iova_start = sg_addr;
 
        ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
 }
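
The ib_update_fast_reg_key(send->s_mr, send->s_remap_count++) call rolls the low 8 consumer-key bits of the MR's rkey on every re-registration, so a peer still holding the previous rkey faults instead of silently reaching the remapped pages. A standalone sketch of that key arithmetic, assuming the usual rkey layout (low byte = consumer key):

#include <stdint.h>
#include <stdio.h>

static uint32_t update_key(uint32_t rkey, uint8_t newkey)
{
	/* keep the MR index bits, swap only the consumer key byte */
	return (rkey & 0xffffff00u) | newkey;
}

int main(void)
{
	uint32_t rkey = 0x12345600;
	uint8_t remap_count = 0;

	for (int i = 0; i < 3; i++) {
		rkey = update_key(rkey, remap_count++);
		printf("rkey %#x\n", (unsigned)rkey);
	}
	return 0;
}
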
 
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 {
        struct rds_iw_connection *ic = conn->c_transport_data;
        struct rds_iw_send_work *send = NULL;
@@ -795,7 +796,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
        struct rds_iw_device *rds_iwdev;
        struct scatterlist *scat;
        unsigned long len;
-       u64 remote_addr = op->op_remote_addr;
+       u64 remote_addr = op->r_remote_addr;
        u32 pos, fr_pos;
        u32 work_alloc;
        u32 i;
@@ -807,21 +808,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
        rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
 
        /* map the message the first time we see it */
-       if (!op->op_mapped) {
-               op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
-                                            op->op_sg, op->op_nents, (op->op_write) ?
-                                            DMA_TO_DEVICE : DMA_FROM_DEVICE);
-               rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
-               if (op->op_count == 0) {
+       if (!op->r_mapped) {
+               op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
+                                       op->r_sg, op->r_nents, (op->r_write) ?
+                                       DMA_TO_DEVICE : DMA_FROM_DEVICE);
+               rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
+               if (op->r_count == 0) {
                        rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
                        ret = -ENOMEM; /* XXX ? */
                        goto out;
                }
 
-               op->op_mapped = 1;
+               op->r_mapped = 1;
        }
 
-       if (!op->op_write) {
+       if (!op->r_write) {
                /* Alloc space on the send queue for the fastreg */
                work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
                if (work_alloc != 1) {
@@ -836,7 +837,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
         * Instead of knowing how to return a partial rdma read/write we insist that there
         * be enough work requests to send the entire message.
         */
-       i = ceil(op->op_count, rds_iwdev->max_sge);
+       i = ceil(op->r_count, rds_iwdev->max_sge);
 
        work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
        if (work_alloc != i) {
@@ -847,17 +848,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
        }
 
        send = &ic->i_sends[pos];
-       if (!op->op_write) {
+       if (!op->r_write) {
                first = prev = &ic->i_sends[fr_pos];
        } else {
                first = send;
                prev = NULL;
        }
-       scat = &op->op_sg[0];
+       scat = &op->r_sg[0];
        sent = 0;
-       num_sge = op->op_count;
+       num_sge = op->r_count;
 
-       for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
+       for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
                send->s_wr.send_flags = 0;
                send->s_queued = jiffies;
 
@@ -874,13 +875,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
                 * for local access after RDS is finished with it, using
                 * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
                 */
-               if (op->op_write)
+               if (op->r_write)
                        send->s_wr.opcode = IB_WR_RDMA_WRITE;
                else
                        send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
 
                send->s_wr.wr.rdma.remote_addr = remote_addr;
-               send->s_wr.wr.rdma.rkey = op->op_rkey;
+               send->s_wr.wr.rdma.rkey = op->r_key;
                send->s_op = op;
 
                if (num_sge > rds_iwdev->max_sge) {
@@ -894,7 +895,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
                if (prev)
                        prev->s_wr.next = &send->s_wr;
 
-               for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+               for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
                        len = ib_sg_dma_len(ic->i_cm_id->device, scat);
 
                        if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
@@ -928,7 +929,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
        }
 
        /* if we finished the message then send completion owns it */
-       if (scat == &op->op_sg[op->op_count])
+       if (scat == &op->r_sg[op->r_count])
                first->s_wr.send_flags = IB_SEND_SIGNALED;
 
        if (i < work_alloc) {
@@ -942,9 +943,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
         * adapters do not allow using the lkey for this at all.  To bypass this use a
         * fastreg_mr (or possibly a dma_mr)
         */
-       if (!op->op_write) {
+       if (!op->r_write) {
                rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
-                       op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
+                       op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
                work_alloc++;
        }
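
rds_iw_xmit_rdma() sizes its allocation as ceil(op->r_count, rds_iwdev->max_sge): one work request per max_sge scatter entries, chained through s_wr.next, with remote_addr advanced by the bytes each WR consumed. A standalone sketch of that split (fixed 4 KB entries stand in for the real per-sge DMA lengths):

#include <stdio.h>

#define ceil_div(a, b) (((a) + (b) - 1) / (b))

int main(void)
{
	unsigned r_count = 10, max_sge = 4, num_sge = r_count;
	unsigned long long remote_addr = 0x1000, sge_bytes = 4096;

	printf("need %u WRs\n", ceil_div(r_count, max_sge));
	for (unsigned wr = 0; num_sge; wr++) {
		unsigned this_wr = num_sge > max_sge ? max_sge : num_sge;

		printf("wr %u: %u sges, remote_addr %#llx\n",
		       wr, this_wr, remote_addr);
		remote_addr += (unsigned long long)this_wr * sge_bytes;
		num_sge -= this_wr;
	}
	return 0;
}
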
 
index 5fe67f6a1d8060f8312abf785a5631a68bec6d44..ccc7e8f0bf0e09253be5cff4a5fd3002e6388b8b 100644 (file)
@@ -37,9 +37,9 @@
 #include "rds.h"
 #include "iw.h"
 
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
+DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned;
 
-static const char *const rds_iw_stat_names[] = {
+static char *rds_iw_stat_names[] = {
        "iw_connect_raced",
        "iw_listen_closed_stale",
        "iw_tx_cq_call",
index 139239d2cb228438e29f347b33035da15d5396c0..9590678cd616837e7699c80bc3ec1bb5c8c09f9c 100644 (file)
@@ -55,69 +55,83 @@ static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
 
 unsigned int rds_iw_sysctl_flow_control = 1;
 
-static struct ctl_table rds_iw_sysctl_table[] = {
+ctl_table rds_iw_sysctl_table[] = {
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "max_send_wr",
                .data           = &rds_iw_sysctl_max_send_wr,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-               .proc_handler   = proc_doulongvec_minmax,
+               .proc_handler   = &proc_doulongvec_minmax,
                .extra1         = &rds_iw_sysctl_max_wr_min,
                .extra2         = &rds_iw_sysctl_max_wr_max,
        },
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "max_recv_wr",
                .data           = &rds_iw_sysctl_max_recv_wr,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-               .proc_handler   = proc_doulongvec_minmax,
+               .proc_handler   = &proc_doulongvec_minmax,
                .extra1         = &rds_iw_sysctl_max_wr_min,
                .extra2         = &rds_iw_sysctl_max_wr_max,
        },
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "max_unsignaled_wr",
                .data           = &rds_iw_sysctl_max_unsig_wrs,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-               .proc_handler   = proc_doulongvec_minmax,
+               .proc_handler   = &proc_doulongvec_minmax,
                .extra1         = &rds_iw_sysctl_max_unsig_wr_min,
                .extra2         = &rds_iw_sysctl_max_unsig_wr_max,
        },
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "max_unsignaled_bytes",
                .data           = &rds_iw_sysctl_max_unsig_bytes,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-               .proc_handler   = proc_doulongvec_minmax,
+               .proc_handler   = &proc_doulongvec_minmax,
                .extra1         = &rds_iw_sysctl_max_unsig_bytes_min,
                .extra2         = &rds_iw_sysctl_max_unsig_bytes_max,
        },
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "max_recv_allocation",
                .data           = &rds_iw_sysctl_max_recv_allocation,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-               .proc_handler   = proc_doulongvec_minmax,
+               .proc_handler   = &proc_doulongvec_minmax,
        },
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "flow_control",
                .data           = &rds_iw_sysctl_flow_control,
                .maxlen         = sizeof(rds_iw_sysctl_flow_control),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = &proc_dointvec,
        },
+       { .ctl_name = 0}
+};
+
+static struct ctl_path rds_iw_sysctl_path[] = {
+       { .procname = "net", .ctl_name = CTL_NET, },
+       { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
+       { .procname = "iw", .ctl_name = CTL_UNNUMBERED, },
        { }
 };
 
 void rds_iw_sysctl_exit(void)
 {
-       unregister_net_sysctl_table(rds_iw_sysctl_hdr);
+       if (rds_iw_sysctl_hdr)
+               unregister_sysctl_table(rds_iw_sysctl_hdr);
 }
 
-int rds_iw_sysctl_init(void)
+int __init rds_iw_sysctl_init(void)
 {
-       rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table);
-       if (!rds_iw_sysctl_hdr)
+       rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
+       if (rds_iw_sysctl_hdr == NULL)
                return -ENOMEM;
        return 0;
 }
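
This file returns to the pre-register_net_sysctl() interface: a ctl_path chain plus CTL_UNNUMBERED entries handed to register_sysctl_paths(), which still surfaces the files under /proc/sys/net/rds/iw/. A minimal kernel-style sketch of that registration pattern, reduced to one entry (a fragment for old kernels; ctl_name and register_sysctl_paths() have since been removed):

static unsigned int example_flag = 1;

static ctl_table example_table[] = {
	{
		.ctl_name     = CTL_UNNUMBERED,   /* no binary sysctl number */
		.procname     = "flow_control",
		.data         = &example_flag,
		.maxlen       = sizeof(example_flag),
		.mode         = 0644,
		.proc_handler = &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static struct ctl_path example_path[] = {
	{ .procname = "net", .ctl_name = CTL_NET, },
	{ .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
	{ .procname = "iw",  .ctl_name = CTL_UNNUMBERED, },
	{ }
};

static struct ctl_table_header *example_hdr;

static int example_init(void)
{
	example_hdr = register_sysctl_paths(example_path, example_table);
	return example_hdr ? 0 : -ENOMEM;
}
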
index 6b12b68541ae96fb8be76e72cb8d0e6f8c89abee..4a61997f554db1108c69cd619662526d80079f0a 100644 (file)
@@ -31,7 +31,6 @@
  *
  */
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/in.h>
 
 #include "rds.h"
@@ -61,42 +60,39 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
                         unsigned int hdr_off, unsigned int sg,
                         unsigned int off)
 {
-       struct scatterlist *sgp = &rm->data.op_sg[sg];
-       int ret = sizeof(struct rds_header) +
-                       be32_to_cpu(rm->m_inc.i_hdr.h_len);
-
-       /* Do not send cong updates to loopback */
-       if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
-               rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
-               ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off);
-               goto out;
-       }
-
        BUG_ON(hdr_off || sg || off);
 
        rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
-       /* For the embedded inc. Matching put is in loop_inc_free() */
-       rds_message_addref(rm);
+       rds_message_addref(rm); /* for the inc */
 
        rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
-                         GFP_KERNEL);
+                         GFP_KERNEL, KM_USER0);
 
        rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
                            NULL);
 
        rds_inc_put(&rm->m_inc);
-out:
-       return ret;
+
+       return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
 }
 
-/*
- * See rds_loop_xmit(). Since our inc is embedded in the rm, we
- * make sure the rm lives at least until the inc is done.
- */
-static void rds_loop_inc_free(struct rds_incoming *inc)
+static int rds_loop_xmit_cong_map(struct rds_connection *conn,
+                                 struct rds_cong_map *map,
+                                 unsigned long offset)
 {
-        struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
-        rds_message_put(rm);
+       unsigned long i;
+
+       BUG_ON(offset);
+       BUG_ON(map != conn->c_lcong);
+
+       for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
+               memcpy((void *)conn->c_fcong->m_page_addrs[i],
+                      (void *)map->m_page_addrs[i], PAGE_SIZE);
+       }
+
+       rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+
+       return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
 }
 
 /* we need to at least give the thread something to succeed */
@@ -121,8 +117,8 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
        struct rds_loop_connection *lc;
        unsigned long flags;
 
-       lc = kzalloc(sizeof(struct rds_loop_connection), gfp);
-       if (!lc)
+       lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
+       if (lc == NULL)
                return -ENOMEM;
 
        INIT_LIST_HEAD(&lc->loop_node);
@@ -139,12 +135,8 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 static void rds_loop_conn_free(void *arg)
 {
        struct rds_loop_connection *lc = arg;
-       unsigned long flags;
-
        rdsdebug("lc %p\n", lc);
-       spin_lock_irqsave(&loop_conns_lock, flags);
        list_del(&lc->loop_node);
-       spin_unlock_irqrestore(&loop_conns_lock, flags);
        kfree(lc);
 }
 
@@ -183,12 +175,14 @@ void rds_loop_exit(void)
  */
 struct rds_transport rds_loop_transport = {
        .xmit                   = rds_loop_xmit,
+       .xmit_cong_map          = rds_loop_xmit_cong_map,
        .recv                   = rds_loop_recv,
        .conn_alloc             = rds_loop_conn_alloc,
        .conn_free              = rds_loop_conn_free,
        .conn_connect           = rds_loop_conn_connect,
        .conn_shutdown          = rds_loop_conn_shutdown,
        .inc_copy_to_user       = rds_message_inc_copy_to_user,
-       .inc_free               = rds_loop_inc_free,
+       .inc_purge              = rds_message_inc_purge,
+       .inc_free               = rds_message_inc_free,
        .t_name                 = "loopback",
 };
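
rds_loop_transport regains its xmit_cong_map and inc_purge hooks; like every RDS transport, it is just a table of function pointers the core dispatches through. A standalone sketch of that ops-table pattern with hypothetical names:

#include <stdio.h>

struct msg { int len; };

struct transport {
	int  (*xmit)(struct msg *m);
	void (*inc_free)(struct msg *m);
	char t_name[16];
};

static int loop_xmit(struct msg *m)  { return m->len; }  /* "sent" in full */
static void loop_free(struct msg *m) { (void)m; }

static struct transport loop_transport = {
	.xmit     = loop_xmit,
	.inc_free = loop_free,
	.t_name   = "loopback",
};

int main(void)
{
	struct msg m = { .len = 42 };

	printf("%s sent %d\n", loop_transport.t_name, loop_transport.xmit(&m));
	loop_transport.inc_free(&m);
	return 0;
}
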
index 756c73729126d45c18a29bd5859aa04596a1ed0f..5a15dc8d0cd78bf63a27dfc0d9816477fa1959b8 100644 (file)
  *
  */
 #include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/export.h>
 
 #include "rds.h"
+#include "rdma.h"
+
+static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
 
 static unsigned int    rds_exthdr_size[__RDS_EXTHDR_MAX] = {
 [RDS_EXTHDR_NONE]      = 0,
@@ -49,7 +50,6 @@ void rds_message_addref(struct rds_message *rm)
        rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
        atomic_inc(&rm->m_refcount);
 }
-EXPORT_SYMBOL_GPL(rds_message_addref);
 
 /*
  * This relies on dma_map_sg() not touching sg[].page during merging.
@@ -61,28 +61,29 @@ static void rds_message_purge(struct rds_message *rm)
        if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
                return;
 
-       for (i = 0; i < rm->data.op_nents; i++) {
-               rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
+       for (i = 0; i < rm->m_nents; i++) {
+               rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i]));
                /* XXX will have to put_page for page refs */
-               __free_page(sg_page(&rm->data.op_sg[i]));
+               __free_page(sg_page(&rm->m_sg[i]));
        }
-       rm->data.op_nents = 0;
+       rm->m_nents = 0;
 
-       if (rm->rdma.op_active)
-               rds_rdma_free_op(&rm->rdma);
-       if (rm->rdma.op_rdma_mr)
-               rds_mr_put(rm->rdma.op_rdma_mr);
+       if (rm->m_rdma_op)
+               rds_rdma_free_op(rm->m_rdma_op);
+       if (rm->m_rdma_mr)
+               rds_mr_put(rm->m_rdma_mr);
+}
 
-       if (rm->atomic.op_active)
-               rds_atomic_free_op(&rm->atomic);
-       if (rm->atomic.op_rdma_mr)
-               rds_mr_put(rm->atomic.op_rdma_mr);
+void rds_message_inc_purge(struct rds_incoming *inc)
+{
+       struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+       rds_message_purge(rm);
 }
 
 void rds_message_put(struct rds_message *rm)
 {
        rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
-       WARN(!atomic_read(&rm->m_refcount), "danger refcount zero on %p\n", rm);
+
        if (atomic_dec_and_test(&rm->m_refcount)) {
                BUG_ON(!list_empty(&rm->m_sock_item));
                BUG_ON(!list_empty(&rm->m_conn_item));
@@ -91,7 +92,12 @@ void rds_message_put(struct rds_message *rm)
                kfree(rm);
        }
 }
-EXPORT_SYMBOL_GPL(rds_message_put);
+
+void rds_message_inc_free(struct rds_incoming *inc)
+{
+       struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+       rds_message_put(rm);
+}
 
 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
                                 __be16 dport, u64 seq)
@@ -102,10 +108,9 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
        hdr->h_sequence = cpu_to_be64(seq);
        hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
 }
-EXPORT_SYMBOL_GPL(rds_message_populate_header);
 
-int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
-                             const void *data, unsigned int len)
+int rds_message_add_extension(struct rds_header *hdr,
+               unsigned int type, const void *data, unsigned int len)
 {
        unsigned int ext_len = sizeof(u8) + len;
        unsigned char *dst;
@@ -114,7 +119,8 @@ int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
        if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
                return 0;
 
-       if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
+       if (type >= __RDS_EXTHDR_MAX
+        || len != rds_exthdr_size[type])
                return 0;
 
        if (ext_len >= RDS_HEADER_EXT_SPACE)
@@ -127,7 +133,6 @@ int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
        dst[len] = RDS_EXTHDR_NONE;
        return 1;
 }
-EXPORT_SYMBOL_GPL(rds_message_add_extension);
 
 /*
  * If a message has extension headers, retrieve them here.
@@ -175,6 +180,26 @@ none:
        return RDS_EXTHDR_NONE;
 }
 
+int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version)
+{
+       struct rds_ext_header_version ext_hdr;
+
+       ext_hdr.h_version = cpu_to_be32(version);
+       return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr));
+}
+
+int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version)
+{
+       struct rds_ext_header_version ext_hdr;
+       unsigned int pos = 0, len = sizeof(ext_hdr);
+
+       /* We assume the version extension is the only one present */
+       if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION)
+               return 0;
+       *version = be32_to_cpu(ext_hdr.h_version);
+       return 1;
+}
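
The restored version-extension helpers ride on rds_message_next_extension(), which walks typed, length-checked records in the header's extension space until it hits RDS_EXTHDR_NONE. A standalone sketch of such a walk; record types, sizes, and buffer layout here are illustrative only:

#include <stdio.h>
#include <string.h>

#define EXT_NONE    0
#define EXT_VERSION 1
#define EXT_SPACE   16

static const unsigned char exthdr[EXT_SPACE] = { EXT_VERSION, 3, 0, 0, 0, EXT_NONE };
static const unsigned ext_len[] = { [EXT_NONE] = 0, [EXT_VERSION] = 4 };

static int next_extension(unsigned *pos, void *buf, unsigned *buflen)
{
	unsigned char type;

	if (*pos >= EXT_SPACE)
		return EXT_NONE;
	type = exthdr[(*pos)++];
	if (type != EXT_VERSION || *pos + ext_len[type] > EXT_SPACE)
		return EXT_NONE;           /* unknown type ends the walk */
	*buflen = ext_len[type];
	memcpy(buf, &exthdr[*pos], *buflen);
	*pos += *buflen;
	return type;
}

int main(void)
{
	unsigned pos = 0, len = 0;
	unsigned char buf[EXT_SPACE];

	while (next_extension(&pos, buf, &len) != EXT_NONE)
		printf("type %d, len %u, first payload byte %u\n",
		       EXT_VERSION, len, buf[0]);
	return 0;
}
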
+
 int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
 {
        struct rds_ext_header_rdma_dest ext_hdr;
@@ -183,80 +208,42 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
        ext_hdr.h_rdma_offset = cpu_to_be32(offset);
        return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
 }
-EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
 
-/*
- * Each rds_message is allocated with extra space for the scatterlist entries
- * rds ops will need. This is to minimize memory allocation count. Then, each rds op
- * can grab SGs when initializing its part of the rds_message.
- */
-struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
+struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
 {
        struct rds_message *rm;
 
-       if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message))
-               return NULL;
-
-       rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
+       rm = kzalloc(sizeof(struct rds_message) +
+                    (nents * sizeof(struct scatterlist)), gfp);
        if (!rm)
                goto out;
 
-       rm->m_used_sgs = 0;
-       rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
-
+       if (nents)
+               sg_init_table(rm->m_sg, nents);
        atomic_set(&rm->m_refcount, 1);
        INIT_LIST_HEAD(&rm->m_sock_item);
        INIT_LIST_HEAD(&rm->m_conn_item);
        spin_lock_init(&rm->m_rs_lock);
-       init_waitqueue_head(&rm->m_flush_wait);
 
 out:
        return rm;
 }
 
-/*
- * RDS ops use this to grab SG entries from the rm's sg pool.
- */
-struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
-{
-       struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
-       struct scatterlist *sg_ret;
-
-       WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
-       WARN_ON(!nents);
-
-       if (rm->m_used_sgs + nents > rm->m_total_sgs)
-               return NULL;
-
-       sg_ret = &sg_first[rm->m_used_sgs];
-       sg_init_table(sg_ret, nents);
-       rm->m_used_sgs += nents;
-
-       return sg_ret;
-}
-
 struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
 {
        struct rds_message *rm;
        unsigned int i;
-       int num_sgs = ceil(total_len, PAGE_SIZE);
-       int extra_bytes = num_sgs * sizeof(struct scatterlist);
 
-       rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
-       if (!rm)
+       rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
+       if (rm == NULL)
                return ERR_PTR(-ENOMEM);
 
        set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
        rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
-       rm->data.op_nents = ceil(total_len, PAGE_SIZE);
-       rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
-       if (!rm->data.op_sg) {
-               rds_message_put(rm);
-               return ERR_PTR(-ENOMEM);
-       }
+       rm->m_nents = ceil(total_len, PAGE_SIZE);
 
-       for (i = 0; i < rm->data.op_nents; ++i) {
-               sg_set_page(&rm->data.op_sg[i],
+       for (i = 0; i < rm->m_nents; ++i) {
+               sg_set_page(&rm->m_sg[i],
                                virt_to_page(page_addrs[i]),
                                PAGE_SIZE, 0);
        }
@@ -264,54 +251,88 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
        return rm;
 }
 
-int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from)
+struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+                                              size_t total_len)
 {
-       unsigned long to_copy, nbytes;
+       unsigned long to_copy;
+       unsigned long iov_off;
        unsigned long sg_off;
+       struct rds_message *rm;
+       struct iovec *iov;
        struct scatterlist *sg;
-       int ret = 0;
+       int ret;
+
+       rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
+       if (rm == NULL) {
+               ret = -ENOMEM;
+               goto out;
+       }
 
-       rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
+       rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
 
        /*
         * now allocate and copy in the data payload.
         */
-       sg = rm->data.op_sg;
+       sg = rm->m_sg;
+       iov = first_iov;
+       iov_off = 0;
        sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
 
-       while (iov_iter_count(from)) {
-               if (!sg_page(sg)) {
-                       ret = rds_page_remainder_alloc(sg, iov_iter_count(from),
+       while (total_len) {
+               if (sg_page(sg) == NULL) {
+                       ret = rds_page_remainder_alloc(sg, total_len,
                                                       GFP_HIGHUSER);
                        if (ret)
-                               return ret;
-                       rm->data.op_nents++;
+                               goto out;
+                       rm->m_nents++;
                        sg_off = 0;
                }
 
-               to_copy = min_t(unsigned long, iov_iter_count(from),
-                               sg->length - sg_off);
+               while (iov_off == iov->iov_len) {
+                       iov_off = 0;
+                       iov++;
+               }
+
+               to_copy = min(iov->iov_len - iov_off, sg->length - sg_off);
+               to_copy = min_t(size_t, to_copy, total_len);
+
+               rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to "
+                        "sg [%p, %u, %u] + %lu\n",
+                        to_copy, iov->iov_base, iov->iov_len, iov_off,
+                        (void *)sg_page(sg), sg->offset, sg->length, sg_off);
 
-               rds_stats_add(s_copy_from_user, to_copy);
-               nbytes = copy_page_from_iter(sg_page(sg), sg->offset + sg_off,
-                                            to_copy, from);
-               if (nbytes != to_copy)
-                       return -EFAULT;
+               ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off,
+                                             iov->iov_base + iov_off,
+                                             to_copy);
+               if (ret)
+                       goto out;
 
+               iov_off += to_copy;
+               total_len -= to_copy;
                sg_off += to_copy;
 
                if (sg_off == sg->length)
                        sg++;
        }
 
-       return ret;
+       ret = 0;
+out:
+       if (ret) {
+               if (rm)
+                       rds_message_put(rm);
+               rm = ERR_PTR(ret);
+       }
+       return rm;
 }
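
The restored copy path advances two cursors at once, iov_off across the user iovec array and sg_off across the scatterlist, copying min() of whatever remains on each side and stepping whichever cursor filled up. A standalone sketch with plain buffers standing in for iovecs and page fragments:

#include <stdio.h>
#include <string.h>

struct vec { const char *base; size_t len; };

int main(void)
{
	struct vec iov[] = { { "hello ", 6 }, { "world", 5 } };
	char frags[3][4];                       /* 4-byte "pages" */
	size_t iov_off = 0, frag_off = 0, total = 11;
	struct vec *v = iov;
	int f = 0;

	while (total) {
		while (iov_off == v->len) {     /* exhausted this iovec */
			iov_off = 0;
			v++;
		}
		size_t to_copy = v->len - iov_off;
		if (to_copy > sizeof(frags[0]) - frag_off)
			to_copy = sizeof(frags[0]) - frag_off;

		memcpy(&frags[f][frag_off], v->base + iov_off, to_copy);
		iov_off += to_copy;
		frag_off += to_copy;
		total -= to_copy;
		if (frag_off == sizeof(frags[0])) {  /* fragment full */
			frag_off = 0;
			f++;
		}
	}
	printf("%.4s|%.4s|%.3s\n", frags[0], frags[1], frags[2]);
	return 0;
}
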
 
-int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+int rds_message_inc_copy_to_user(struct rds_incoming *inc,
+                                struct iovec *first_iov, size_t size)
 {
        struct rds_message *rm;
+       struct iovec *iov;
        struct scatterlist *sg;
        unsigned long to_copy;
+       unsigned long iov_off;
        unsigned long vec_off;
        int copied;
        int ret;
@@ -320,21 +341,36 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
        rm = container_of(inc, struct rds_message, m_inc);
        len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
 
-       sg = rm->data.op_sg;
+       iov = first_iov;
+       iov_off = 0;
+       sg = rm->m_sg;
        vec_off = 0;
        copied = 0;
 
-       while (iov_iter_count(to) && copied < len) {
-               to_copy = min_t(unsigned long, iov_iter_count(to),
-                               sg->length - vec_off);
+       while (copied < size && copied < len) {
+               while (iov_off == iov->iov_len) {
+                       iov_off = 0;
+                       iov++;
+               }
+
+               to_copy = min(iov->iov_len - iov_off, sg->length - vec_off);
+               to_copy = min_t(size_t, to_copy, size - copied);
                to_copy = min_t(unsigned long, to_copy, len - copied);
 
-               rds_stats_add(s_copy_to_user, to_copy);
-               ret = copy_page_to_iter(sg_page(sg), sg->offset + vec_off,
-                                       to_copy, to);
-               if (ret != to_copy)
-                       return -EFAULT;
+               rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to "
+                        "sg [%p, %u, %u] + %lu\n",
+                        to_copy, iov->iov_base, iov->iov_len, iov_off,
+                        sg_page(sg), sg->offset, sg->length, vec_off);
+
+               ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off,
+                                           iov->iov_base + iov_off,
+                                           to_copy);
+               if (ret) {
+                       copied = ret;
+                       break;
+               }
 
+               iov_off += to_copy;
                vec_off += to_copy;
                copied += to_copy;
 
@@ -353,14 +389,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
  */
 void rds_message_wait(struct rds_message *rm)
 {
-       wait_event_interruptible(rm->m_flush_wait,
+       wait_event(rds_message_flush_waitq,
                        !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
 }
 
 void rds_message_unmapped(struct rds_message *rm)
 {
        clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
-       wake_up_interruptible(&rm->m_flush_wait);
+       if (waitqueue_active(&rds_message_flush_waitq))
+               wake_up(&rds_message_flush_waitq);
 }
-EXPORT_SYMBOL_GPL(rds_message_unmapped);
 
index 9005a2c920ee6dccc045a15a707ae64ae59d3f87..c460743a89ad00c594fd8f606c083321e0ee8b39 100644 (file)
@@ -31,9 +31,6 @@
  *
  */
 #include <linux/highmem.h>
-#include <linux/gfp.h>
-#include <linux/cpu.h>
-#include <linux/export.h>
 
 #include "rds.h"
 
@@ -42,8 +39,7 @@ struct rds_page_remainder {
        unsigned long   r_offset;
 };
 
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
-                                    rds_page_remainders);
+DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned;
 
 /*
  * returns 0 on success or -errno on failure.
@@ -60,26 +56,37 @@ int rds_page_copy_user(struct page *page, unsigned long offset,
        unsigned long ret;
        void *addr;
 
-       addr = kmap(page);
-       if (to_user) {
+       if (to_user)
                rds_stats_add(s_copy_to_user, bytes);
-               ret = copy_to_user(ptr, addr + offset, bytes);
-       } else {
+       else
                rds_stats_add(s_copy_from_user, bytes);
-               ret = copy_from_user(addr + offset, ptr, bytes);
+
+       addr = kmap_atomic(page, KM_USER0);
+       if (to_user)
+               ret = __copy_to_user_inatomic(ptr, addr + offset, bytes);
+       else
+               ret = __copy_from_user_inatomic(addr + offset, ptr, bytes);
+       kunmap_atomic(addr, KM_USER0);
+
+       if (ret) {
+               addr = kmap(page);
+               if (to_user)
+                       ret = copy_to_user(ptr, addr + offset, bytes);
+               else
+                       ret = copy_from_user(addr + offset, ptr, bytes);
+               kunmap(page);
+               if (ret)
+                       return -EFAULT;
        }
-       kunmap(page);
 
-       return ret ? -EFAULT : 0;
+       return 0;
 }
-EXPORT_SYMBOL_GPL(rds_page_copy_user);
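
The restored rds_page_copy_user() first attempts the copy under kmap_atomic() with the __copy_*_inatomic() variants, which cannot sleep and therefore report leftover bytes on an unresolved fault, and only then retries with the sleeping kmap() plus copy_*_user() pair. A standalone sketch of that fast-path/slow-path shape (the stub "atomic" copy just simulates a partial fault):

#include <stdio.h>
#include <string.h>

/* Pretend non-blocking copy: "faults" past the first 4 bytes and
 * returns bytes left uncopied, like __copy_to_user_inatomic(). */
static size_t copy_inatomic(char *dst, const char *src, size_t n)
{
	size_t done = n < 4 ? n : 4;

	memcpy(dst, src, done);
	return n - done;
}

/* Blocking copy: allowed to "sleep" to fault pages in, copies it all. */
static size_t copy_blocking(char *dst, const char *src, size_t n)
{
	memcpy(dst, src, n);
	return 0;
}

int main(void)
{
	char dst[9] = "";

	if (copy_inatomic(dst, "atomic!!", 8))   /* fast path short-copied */
		if (copy_blocking(dst, "atomic!!", 8))
			return -1;               /* would be -EFAULT */
	printf("%s\n", dst);
	return 0;
}
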
 
-/**
- * rds_page_remainder_alloc - build up regions of a message.
+/*
+ * Message allocation uses this to build up regions of a message.
  *
- * @scat: Scatter list for message
- * @bytes: the number of bytes needed.
- * @gfp: the waiting behaviour of the allocation
+ * @bytes - the number of bytes needed.
+ * @gfp - the waiting behaviour of the allocation
  *
  * @gfp is always ored with __GFP_HIGHMEM.  Callers must be prepared to
  * kmap the pages, etc.
@@ -107,7 +114,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
        /* jump straight to allocation if we're trying for a huge page */
        if (bytes >= PAGE_SIZE) {
                page = alloc_page(gfp);
-               if (!page) {
+               if (page == NULL) {
                        ret = -ENOMEM;
                } else {
                        sg_set_page(scat, page, PAGE_SIZE, 0);
@@ -153,7 +160,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
                rem = &per_cpu(rds_page_remainders, get_cpu());
                local_irq_save(flags);
 
-               if (!page) {
+               if (page == NULL) {
                        ret = -ENOMEM;
                        break;
                }
@@ -177,7 +184,6 @@ out:
                 ret ? 0 : scat->length);
        return ret;
 }
-EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
 
 static int rds_page_remainder_cpu_notify(struct notifier_block *self,
                                         unsigned long action, void *hcpu)
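
rds_page_remainders caches, per CPU, the unused tail of the last page handed out, so sub-page requests are carved from it under get_cpu()/local_irq_save() rather than costing a fresh page each time. A single-threaded standalone sketch of the carve-from-remainder idea (the real code holds page references; this sketch deliberately ignores freeing):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096

struct remainder { char *r_page; unsigned long r_offset; };

static struct remainder rem;   /* per-CPU in the real code */

static void *alloc_bytes(unsigned long bytes)
{
	void *p;

	if (!rem.r_page || rem.r_offset + bytes > PAGE_SIZE) {
		rem.r_page = malloc(PAGE_SIZE);   /* new backing page */
		rem.r_offset = 0;
	}
	p = rem.r_page + rem.r_offset;        /* carve from the tail */
	rem.r_offset += bytes;
	return p;
}

int main(void)
{
	printf("%p %p\n", alloc_bytes(100), alloc_bytes(200));
	return 0;
}
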
index 40084d843e9fe33bc1545f1f573a32780880e223..eaeeb91e11196a07405ee9cbfc417c7da3793ea3 100644 (file)
  *
  */
 #include <linux/pagemap.h>
-#include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
 
-#include "rds.h"
+#include "rdma.h"
 
 /*
  * XXX
@@ -130,22 +129,14 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
 {
        struct rds_mr *mr;
        struct rb_node *node;
-       unsigned long flags;
 
        /* Release any MRs associated with this socket */
-       spin_lock_irqsave(&rs->rs_rdma_lock, flags);
        while ((node = rb_first(&rs->rs_rdma_keys))) {
                mr = container_of(node, struct rds_mr, r_rb_node);
                if (mr->r_trans == rs->rs_transport)
                        mr->r_invalidate = 0;
-               rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
-               RB_CLEAR_NODE(&mr->r_rb_node);
-               spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
-               rds_destroy_mr(mr);
                rds_mr_put(mr);
-               spin_lock_irqsave(&rs->rs_rdma_lock, flags);
        }
-       spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 
        if (rs->rs_transport && rs->rs_transport->flush_mrs)
                rs->rs_transport->flush_mrs();
@@ -159,9 +150,12 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
 {
        int ret;
 
-       ret = get_user_pages_fast(user_addr, nr_pages, write, pages);
+       down_read(&current->mm->mmap_sem);
+       ret = get_user_pages(current, current->mm, user_addr,
+                            nr_pages, write, 0, pages, NULL);
+       up_read(&current->mm->mmap_sem);
 
-       if (ret >= 0 && ret < nr_pages) {
+       if (0 <= ret && (unsigned) ret < nr_pages) {
                while (ret--)
                        put_page(pages[ret]);
                ret = -EFAULT;
@@ -189,7 +183,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
                goto out;
        }
 
-       if (!rs->rs_transport->get_mr) {
+       if (rs->rs_transport->get_mr == NULL) {
                ret = -EOPNOTSUPP;
                goto out;
        }
@@ -205,13 +199,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 
        /* XXX clamp nr_pages to limit the size of this alloc? */
        pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
-       if (!pages) {
+       if (pages == NULL) {
                ret = -ENOMEM;
                goto out;
        }
 
        mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
-       if (!mr) {
+       if (mr == NULL) {
                ret = -ENOMEM;
                goto out;
        }
@@ -238,13 +232,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
         * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
         * the zero page.
         */
-       ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
+       ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
        if (ret < 0)
                goto out;
 
        nents = ret;
        sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
-       if (!sg) {
+       if (sg == NULL) {
                ret = -ENOMEM;
                goto out;
        }
@@ -326,30 +320,6 @@ int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
        return __rds_rdma_map(rs, &args, NULL, NULL);
 }
 
-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
-{
-       struct rds_get_mr_for_dest_args args;
-       struct rds_get_mr_args new_args;
-
-       if (optlen != sizeof(struct rds_get_mr_for_dest_args))
-               return -EINVAL;
-
-       if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval,
-                          sizeof(struct rds_get_mr_for_dest_args)))
-               return -EFAULT;
-
-       /*
-        * Initially, just behave like get_mr().
-        * TODO: Implement get_mr as wrapper around this
-        *       and deprecate it.
-        */
-       new_args.vec = args.vec;
-       new_args.cookie_addr = args.cookie_addr;
-       new_args.flags = args.flags;
-
-       return __rds_rdma_map(rs, &new_args, NULL, NULL);
-}
-
 /*
  * Free the MR indicated by the given R_Key
  */
@@ -414,217 +384,132 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
 
        spin_lock_irqsave(&rs->rs_rdma_lock, flags);
        mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
-       if (!mr) {
-               printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
-               spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
-               return;
-       }
-
-       if (mr->r_use_once || force) {
+       if (mr && (mr->r_use_once || force)) {
                rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
                RB_CLEAR_NODE(&mr->r_rb_node);
                zot_me = 1;
-       }
+       } else if (mr)
+               atomic_inc(&mr->r_refcount);
        spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 
        /* May have to issue a dma_sync on this memory region.
         * Note we could avoid this if the operation was a RDMA READ,
         * but at this point we can't tell. */
-       if (mr->r_trans->sync_mr)
-               mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
-
-       /* If the MR was marked as invalidate, this will
-        * trigger an async flush. */
-       if (zot_me)
-               rds_destroy_mr(mr);
-       rds_mr_put(mr);
+       if (mr != NULL) {
+               if (mr->r_trans->sync_mr)
+                       mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+               /* If the MR was marked as invalidate, this will
+                * trigger an async flush. */
+               if (zot_me)
+                       rds_destroy_mr(mr);
+               rds_mr_put(mr);
+       }
 }
 
-void rds_rdma_free_op(struct rm_rdma_op *ro)
+void rds_rdma_free_op(struct rds_rdma_op *ro)
 {
        unsigned int i;
 
-       for (i = 0; i < ro->op_nents; i++) {
-               struct page *page = sg_page(&ro->op_sg[i]);
+       for (i = 0; i < ro->r_nents; i++) {
+               struct page *page = sg_page(&ro->r_sg[i]);
 
                /* Mark page dirty if it was possibly modified, which
                 * is the case for a RDMA_READ which copies from remote
                 * to local memory */
-               if (!ro->op_write) {
-                       BUG_ON(irqs_disabled());
+               if (!ro->r_write)
                        set_page_dirty(page);
-               }
                put_page(page);
        }
 
-       kfree(ro->op_notifier);
-       ro->op_notifier = NULL;
-       ro->op_active = 0;
-}
-
-void rds_atomic_free_op(struct rm_atomic_op *ao)
-{
-       struct page *page = sg_page(ao->op_sg);
-
-       /* Mark page dirty if it was possibly modified, which
-        * is the case for a RDMA_READ which copies from remote
-        * to local memory */
-       set_page_dirty(page);
-       put_page(page);
-
-       kfree(ao->op_notifier);
-       ao->op_notifier = NULL;
-       ao->op_active = 0;
+       kfree(ro->r_notifier);
+       kfree(ro);
 }
 
-
 /*
- * Count the number of pages needed to describe an incoming iovec array.
+ * args is a pointer to an in-kernel copy in the sendmsg cmsg.
  */
-static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
-{
-       int tot_pages = 0;
-       unsigned int nr_pages;
-       unsigned int i;
-
-       /* figure out the number of pages in the vector */
-       for (i = 0; i < nr_iovecs; i++) {
-               nr_pages = rds_pages_in_vec(&iov[i]);
-               if (nr_pages == 0)
-                       return -EINVAL;
-
-               tot_pages += nr_pages;
-
-               /*
-                * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
-                * so tot_pages cannot overflow without first going negative.
-                */
-               if (tot_pages < 0)
-                       return -EINVAL;
-       }
-
-       return tot_pages;
-}
-
-int rds_rdma_extra_size(struct rds_rdma_args *args)
+static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
+                                           struct rds_rdma_args *args)
 {
        struct rds_iovec vec;
-       struct rds_iovec __user *local_vec;
-       int tot_pages = 0;
+       struct rds_rdma_op *op = NULL;
        unsigned int nr_pages;
-       unsigned int i;
-
-       local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
-
-       /* figure out the number of pages in the vector */
-       for (i = 0; i < args->nr_local; i++) {
-               if (copy_from_user(&vec, &local_vec[i],
-                                  sizeof(struct rds_iovec)))
-                       return -EFAULT;
-
-               nr_pages = rds_pages_in_vec(&vec);
-               if (nr_pages == 0)
-                       return -EINVAL;
-
-               tot_pages += nr_pages;
-
-               /*
-                * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
-                * so tot_pages cannot overflow without first going negative.
-                */
-               if (tot_pages < 0)
-                       return -EINVAL;
-       }
-
-       return tot_pages * sizeof(struct scatterlist);
-}
-
-/*
- * The application asks for a RDMA transfer.
- * Extract all arguments and set up the rdma_op
- */
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-                         struct cmsghdr *cmsg)
-{
-       struct rds_rdma_args *args;
-       struct rm_rdma_op *op = &rm->rdma;
-       int nr_pages;
+       unsigned int max_pages;
        unsigned int nr_bytes;
        struct page **pages = NULL;
-       struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
-       int iov_size;
+       struct rds_iovec __user *local_vec;
+       struct scatterlist *sg;
+       unsigned int nr;
        unsigned int i, j;
-       int ret = 0;
-
-       if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
-           || rm->rdma.op_active)
-               return -EINVAL;
+       int ret;
 
-       args = CMSG_DATA(cmsg);
 
        if (rs->rs_bound_addr == 0) {
                ret = -ENOTCONN; /* XXX not a great errno */
-               goto out_ret;
+               goto out;
        }
 
-       if (args->nr_local > UIO_MAXIOV) {
+       if (args->nr_local > (u64)UINT_MAX) {
                ret = -EMSGSIZE;
-               goto out_ret;
+               goto out;
        }
 
-       /* Check whether to allocate the iovec area */
-       iov_size = args->nr_local * sizeof(struct rds_iovec);
-       if (args->nr_local > UIO_FASTIOV) {
-               iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
-               if (!iovs) {
-                       ret = -ENOMEM;
-                       goto out_ret;
+       nr_pages = 0;
+       max_pages = 0;
+
+       local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+       /* figure out the number of pages in the vector */
+       for (i = 0; i < args->nr_local; i++) {
+               if (copy_from_user(&vec, &local_vec[i],
+                                  sizeof(struct rds_iovec))) {
+                       ret = -EFAULT;
+                       goto out;
                }
-       }
 
-       if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
-               ret = -EFAULT;
-               goto out;
-       }
+               nr = rds_pages_in_vec(&vec);
+               if (nr == 0) {
+                       ret = -EINVAL;
+                       goto out;
+               }
 
-       nr_pages = rds_rdma_pages(iovs, args->nr_local);
-       if (nr_pages < 0) {
-               ret = -EINVAL;
-               goto out;
+               max_pages = max(nr, max_pages);
+               nr_pages += nr;
        }
 
-       pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
-       if (!pages) {
+       pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
+       if (pages == NULL) {
                ret = -ENOMEM;
                goto out;
        }
 
-       op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
-       op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
-       op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
-       op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
-       op->op_active = 1;
-       op->op_recverr = rs->rs_recverr;
-       WARN_ON(!nr_pages);
-       op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
-       if (!op->op_sg) {
+       op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
+       if (op == NULL) {
                ret = -ENOMEM;
                goto out;
        }
 
-       if (op->op_notify || op->op_recverr) {
+       op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
+       op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
+       op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+       op->r_recverr = rs->rs_recverr;
+       WARN_ON(!nr_pages);
+       sg_init_table(op->r_sg, nr_pages);
+
+       if (op->r_notify || op->r_recverr) {
                /* We allocate an uninitialized notifier here, because
                 * we don't want to do that in the completion handler. We
                 * would have to use GFP_ATOMIC there, and don't want to deal
                 * with failed allocations.
                 */
-               op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
-               if (!op->op_notifier) {
+               op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+               if (!op->r_notifier) {
                        ret = -ENOMEM;
                        goto out;
                }
-               op->op_notifier->n_user_token = args->user_token;
-               op->op_notifier->n_status = RDS_RDMA_SUCCESS;
+               op->r_notifier->n_user_token = args->user_token;
+               op->r_notifier->n_status = RDS_RDMA_SUCCESS;
        }
 
        /* The cookie contains the R_Key of the remote memory region, and
@@ -634,55 +519,68 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
         * destination address (which is really an offset into the MR)
         * FIXME: We may want to move this into ib_rdma.c
         */
-       op->op_rkey = rds_rdma_cookie_key(args->cookie);
-       op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+       op->r_key = rds_rdma_cookie_key(args->cookie);
+       op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
 
        nr_bytes = 0;
 
        rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
               (unsigned long long)args->nr_local,
               (unsigned long long)args->remote_vec.addr,
-              op->op_rkey);
+              op->r_key);
 
        for (i = 0; i < args->nr_local; i++) {
-               struct rds_iovec *iov = &iovs[i];
-               /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
-               unsigned int nr = rds_pages_in_vec(iov);
+               if (copy_from_user(&vec, &local_vec[i],
+                                  sizeof(struct rds_iovec))) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+
+               nr = rds_pages_in_vec(&vec);
+               if (nr == 0) {
+                       ret = -EINVAL;
+                       goto out;
+               }
 
-               rs->rs_user_addr = iov->addr;
-               rs->rs_user_bytes = iov->bytes;
+               rs->rs_user_addr = vec.addr;
+               rs->rs_user_bytes = vec.bytes;
 
+               /* did the user change the vec under us? */
+               if (nr > max_pages || op->r_nents + nr > nr_pages) {
+                       ret = -EINVAL;
+                       goto out;
+               }
                /* If it's a WRITE operation, we want to pin the pages for reading.
                 * If it's a READ operation, we need to pin the pages for writing.
                 */
-               ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
+               ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write);
                if (ret < 0)
                        goto out;
 
-               rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
-                        nr_bytes, nr, iov->bytes, iov->addr);
+               rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n",
+                      nr_bytes, nr, vec.bytes, vec.addr);
 
-               nr_bytes += iov->bytes;
+               nr_bytes += vec.bytes;
 
                for (j = 0; j < nr; j++) {
-                       unsigned int offset = iov->addr & ~PAGE_MASK;
-                       struct scatterlist *sg;
+                       unsigned int offset = vec.addr & ~PAGE_MASK;
 
-                       sg = &op->op_sg[op->op_nents + j];
+                       sg = &op->r_sg[op->r_nents + j];
                        sg_set_page(sg, pages[j],
-                                       min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
+                                       min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
                                        offset);
 
-                       rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
-                              sg->offset, sg->length, iov->addr, iov->bytes);
+                       rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n",
+                              sg->offset, sg->length, vec.addr, vec.bytes);
 
-                       iov->addr += sg->length;
-                       iov->bytes -= sg->length;
+                       vec.addr += sg->length;
+                       vec.bytes -= sg->length;
                }
 
-               op->op_nents += nr;
+               op->r_nents += nr;
        }
 
+
        if (nr_bytes > args->remote_vec.bytes) {
                rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
                                nr_bytes,
@@ -690,19 +588,38 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
                ret = -EINVAL;
                goto out;
        }
-       op->op_bytes = nr_bytes;
+       op->r_bytes = nr_bytes;
 
+       ret = 0;
 out:
-       if (iovs != iovstack)
-               sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
        kfree(pages);
-out_ret:
-       if (ret)
-               rds_rdma_free_op(op);
-       else
-               rds_stats_inc(s_send_rdma);
+       if (ret) {
+               if (op)
+                       rds_rdma_free_op(op);
+               op = ERR_PTR(ret);
+       }
+       return op;
+}
 
-       return ret;
+/*
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
+ */
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+                         struct cmsghdr *cmsg)
+{
+       struct rds_rdma_op *op;
+
+       if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+        || rm->m_rdma_op != NULL)
+               return -EINVAL;
+
+       op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
+       if (IS_ERR(op))
+               return PTR_ERR(op);
+       rds_stats_inc(s_send_rdma);
+       rm->m_rdma_op = op;
+       return 0;
 }
 
 /*
@@ -717,8 +634,8 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
        u32 r_key;
        int err = 0;
 
-       if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) ||
-           rm->m_rdma_cookie != 0)
+       if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t))
+        || rm->m_rdma_cookie != 0)
                return -EINVAL;
 
        memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
@@ -732,7 +649,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 
        spin_lock_irqsave(&rs->rs_rdma_lock, flags);
        mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
-       if (!mr)
+       if (mr == NULL)
                err = -EINVAL;  /* invalid r_key */
        else
                atomic_inc(&mr->r_refcount);
@@ -740,7 +657,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 
        if (mr) {
                mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
-               rm->rdma.op_rdma_mr = mr;
+               rm->m_rdma_mr = mr;
        }
        return err;
 }
@@ -754,106 +671,9 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
                          struct cmsghdr *cmsg)
 {
-       if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) ||
-           rm->m_rdma_cookie != 0)
-               return -EINVAL;
-
-       return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
-}
-
-/*
- * Fill in rds_message for an atomic request.
- */
-int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
-                   struct cmsghdr *cmsg)
-{
-       struct page *page = NULL;
-       struct rds_atomic_args *args;
-       int ret = 0;
-
-       if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
-        || rm->atomic.op_active)
+       if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args))
+        || rm->m_rdma_cookie != 0)
                return -EINVAL;
 
-       args = CMSG_DATA(cmsg);
-
-       /* Nonmasked & masked cmsg ops converted to masked hw ops */
-       switch (cmsg->cmsg_type) {
-       case RDS_CMSG_ATOMIC_FADD:
-               rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
-               rm->atomic.op_m_fadd.add = args->fadd.add;
-               rm->atomic.op_m_fadd.nocarry_mask = 0;
-               break;
-       case RDS_CMSG_MASKED_ATOMIC_FADD:
-               rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
-               rm->atomic.op_m_fadd.add = args->m_fadd.add;
-               rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
-               break;
-       case RDS_CMSG_ATOMIC_CSWP:
-               rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
-               rm->atomic.op_m_cswp.compare = args->cswp.compare;
-               rm->atomic.op_m_cswp.swap = args->cswp.swap;
-               rm->atomic.op_m_cswp.compare_mask = ~0;
-               rm->atomic.op_m_cswp.swap_mask = ~0;
-               break;
-       case RDS_CMSG_MASKED_ATOMIC_CSWP:
-               rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
-               rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
-               rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
-               rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
-               rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
-               break;
-       default:
-               BUG(); /* should never happen */
-       }
-
-       rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
-       rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
-       rm->atomic.op_active = 1;
-       rm->atomic.op_recverr = rs->rs_recverr;
-       rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
-       if (!rm->atomic.op_sg) {
-               ret = -ENOMEM;
-               goto err;
-       }
-
-       /* verify 8 byte-aligned */
-       if (args->local_addr & 0x7) {
-               ret = -EFAULT;
-               goto err;
-       }
-
-       ret = rds_pin_pages(args->local_addr, 1, &page, 1);
-       if (ret != 1)
-               goto err;
-       ret = 0;
-
-       sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
-
-       if (rm->atomic.op_notify || rm->atomic.op_recverr) {
-               /* We allocate an uninitialized notifier here, because
-                * we don't want to do that in the completion handler. We
-                * would have to use GFP_ATOMIC there, and don't want to deal
-                * with failed allocations.
-                */
-               rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
-               if (!rm->atomic.op_notifier) {
-                       ret = -ENOMEM;
-                       goto err;
-               }
-
-               rm->atomic.op_notifier->n_user_token = args->user_token;
-               rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
-       }
-
-       rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
-       rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
-
-       return ret;
-err:
-       if (page)
-               put_page(page);
-       kfree(rm->atomic.op_notifier);
-
-       return ret;
+       return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr);
 }
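
For context, rds_cmsg_rdma_args() above is fed from userspace as ancillary data on sendmsg(). A rough sketch of the sending side, assuming the RDS UAPI definitions (struct rds_rdma_args, RDS_CMSG_RDMA_ARGS and the RDS_RDMA_* flags from linux/rds.h, SOL_RDS per linux/socket.h) and a PF_RDS socket already bound and connected; this is illustrative, not taken from the patch:

#include <linux/rds.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>

#ifndef SOL_RDS
#define SOL_RDS 276			/* from linux/socket.h */
#endif

/* Queue an RDMA transfer described by one local iovec on a connected
 * PF_RDS socket.  'cookie' came back from an earlier RDS_CMSG_RDMA_MAP
 * or RDS_GET_MR exchange. */
static ssize_t queue_rdma(int fd, rds_rdma_cookie_t cookie,
			  struct rds_iovec *local,
			  uint64_t remote_addr, uint64_t remote_bytes)
{
	struct rds_rdma_args args;
	char cbuf[CMSG_SPACE(sizeof(args))];
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&args, 0, sizeof(args));
	args.cookie = cookie;
	args.remote_vec.addr = remote_addr;
	args.remote_vec.bytes = remote_bytes;
	args.local_vec_addr = (uint64_t)(unsigned long)local;
	args.nr_local = 1;
	args.flags = RDS_RDMA_READWRITE | RDS_RDMA_NOTIFY_ME;
	args.user_token = 42;	/* echoed back via RDS_CMSG_RDMA_STATUS */

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_RDS;
	cmsg->cmsg_type = RDS_CMSG_RDMA_ARGS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(args));
	memcpy(CMSG_DATA(cmsg), &args, sizeof(args));

	/* Zero-length payload: the RDMA request rides in the cmsg. */
	return sendmsg(fd, &msg, 0);
}
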
diff --git a/net/rds/rdma.h b/net/rds/rdma.h
new file mode 100644
index 0000000..4255120
--- /dev/null
@@ -0,0 +1,84 @@
+#ifndef _RDS_RDMA_H
+#define _RDS_RDMA_H
+
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/scatterlist.h>
+
+#include "rds.h"
+
+struct rds_mr {
+       struct rb_node          r_rb_node;
+       atomic_t                r_refcount;
+       u32                     r_key;
+
+       /* A copy of the creation flags */
+       unsigned int            r_use_once:1;
+       unsigned int            r_invalidate:1;
+       unsigned int            r_write:1;
+
+       /* This is for RDS_MR_DEAD.
+        * It would be nice & consistent to make this part of the above
+        * bit field here, but we need to use test_and_set_bit.
+        */
+       unsigned long           r_state;
+       struct rds_sock         *r_sock; /* back pointer to the socket that owns us */
+       struct rds_transport    *r_trans;
+       void                    *r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDS_MR_DEAD            0
+
+struct rds_rdma_op {
+       u32                     r_key;
+       u64                     r_remote_addr;
+       unsigned int            r_write:1;
+       unsigned int            r_fence:1;
+       unsigned int            r_notify:1;
+       unsigned int            r_recverr:1;
+       unsigned int            r_mapped:1;
+       struct rds_notifier     *r_notifier;
+       unsigned int            r_bytes;
+       unsigned int            r_nents;
+       unsigned int            r_count;
+       struct scatterlist      r_sg[0];
+};
+
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+       return r_key | (((u64) offset) << 32);
+}
+
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+       return cookie;
+}
+
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+       return cookie >> 32;
+}
+
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+                         struct cmsghdr *cmsg);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+                         struct cmsghdr *cmsg);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+                         struct cmsghdr *cmsg);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+                         struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rds_rdma_op *ro);
+void rds_rdma_send_complete(struct rds_message *rm, int);
+
+extern void __rds_put_mr_final(struct rds_mr *mr);
+static inline void rds_mr_put(struct rds_mr *mr)
+{
+       if (atomic_dec_and_test(&mr->r_refcount))
+               __rds_put_mr_final(mr);
+}
+
+#endif
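
The cookie helpers in this new header pack the MR's R_Key into the low 32 bits and the byte offset into the high 32 bits of the 64-bit cookie. A quick userspace round-trip check of that layout, mirroring the inlines above with stdint types:

#include <assert.h>
#include <stdint.h>

typedef uint64_t rds_rdma_cookie_t;

static rds_rdma_cookie_t make_cookie(uint32_t r_key, uint32_t offset)
{
	return r_key | (((uint64_t) offset) << 32);	/* as rds_rdma_make_cookie() */
}

int main(void)
{
	rds_rdma_cookie_t c = make_cookie(0xdeadbeef, 0x1000);

	assert((uint32_t)c == 0xdeadbeef);	/* rds_rdma_cookie_key() */
	assert((uint32_t)(c >> 32) == 0x1000);	/* rds_rdma_cookie_offset() */
	return 0;
}
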
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 6cd9d1deafc395d6573b7e3b801e666106d1f257..7b19024f97069e00340c20cb07daee54e28835de 100644
  * SOFTWARE.
  *
  */
-#include <linux/module.h>
 #include <rdma/rdma_cm.h>
 
 #include "rdma_transport.h"
 
-static struct rdma_cm_id *rds_rdma_listen_id;
-
-static char *rds_cm_event_strings[] = {
-#define RDS_CM_EVENT_STRING(foo) \
-               [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
-       RDS_CM_EVENT_STRING(ADDR_RESOLVED),
-       RDS_CM_EVENT_STRING(ADDR_ERROR),
-       RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
-       RDS_CM_EVENT_STRING(ROUTE_ERROR),
-       RDS_CM_EVENT_STRING(CONNECT_REQUEST),
-       RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
-       RDS_CM_EVENT_STRING(CONNECT_ERROR),
-       RDS_CM_EVENT_STRING(UNREACHABLE),
-       RDS_CM_EVENT_STRING(REJECTED),
-       RDS_CM_EVENT_STRING(ESTABLISHED),
-       RDS_CM_EVENT_STRING(DISCONNECTED),
-       RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
-       RDS_CM_EVENT_STRING(MULTICAST_JOIN),
-       RDS_CM_EVENT_STRING(MULTICAST_ERROR),
-       RDS_CM_EVENT_STRING(ADDR_CHANGE),
-       RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
-#undef RDS_CM_EVENT_STRING
-};
-
-static char *rds_cm_event_str(enum rdma_cm_event_type type)
-{
-       return rds_str_array(rds_cm_event_strings,
-                            ARRAY_SIZE(rds_cm_event_strings), type);
-};
+static struct rdma_cm_id *rds_iw_listen_id;
 
 int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                              struct rdma_cm_event *event)
@@ -73,8 +44,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
        struct rds_transport *trans;
        int ret = 0;
 
-       rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
-                event->event, rds_cm_event_str(event->event));
+       rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
+                event->event);
 
        if (cm_id->device->node_type == RDMA_NODE_RNIC)
                trans = &rds_iw_transport;
@@ -130,7 +101,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                break;
 
        case RDMA_CM_EVENT_DISCONNECTED:
-               rdsdebug("DISCONNECT event - dropping connection "
+               printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection "
                        "%pI4->%pI4\n", &conn->c_laddr,
                         &conn->c_faddr);
                rds_conn_drop(conn);
@@ -138,8 +109,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 
        default:
                /* things like device disconnect? */
-               printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
-                      event->event, rds_cm_event_str(event->event));
+               printk(KERN_ERR "unknown event %u\n", event->event);
+               BUG();
                break;
        }
 
@@ -147,28 +118,26 @@ out:
        if (conn)
                mutex_unlock(&conn->c_cm_lock);
 
-       rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
-                rds_cm_event_str(event->event), ret);
+       rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);
 
        return ret;
 }
 
-static int rds_rdma_listen_init(void)
+static int __init rds_rdma_listen_init(void)
 {
        struct sockaddr_in sin;
        struct rdma_cm_id *cm_id;
        int ret;
 
-       cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP,
-                              IB_QPT_RC);
+       cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP);
        if (IS_ERR(cm_id)) {
                ret = PTR_ERR(cm_id);
-               printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+               printk(KERN_ERR "RDS/IW: failed to setup listener, "
                       "rdma_create_id() returned %d\n", ret);
-               return ret;
+               goto out;
        }
 
-       sin.sin_family = AF_INET;
+       sin.sin_family = PF_INET,
        sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
        sin.sin_port = (__force u16)htons(RDS_PORT);
 
@@ -178,21 +147,21 @@ static int rds_rdma_listen_init(void)
         */
        ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
        if (ret) {
-               printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+               printk(KERN_ERR "RDS/IW: failed to setup listener, "
                       "rdma_bind_addr() returned %d\n", ret);
                goto out;
        }
 
        ret = rdma_listen(cm_id, 128);
        if (ret) {
-               printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+               printk(KERN_ERR "RDS/IW: failed to setup listener, "
                       "rdma_listen() returned %d\n", ret);
                goto out;
        }
 
        rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
 
-       rds_rdma_listen_id = cm_id;
+       rds_iw_listen_id = cm_id;
        cm_id = NULL;
 out:
        if (cm_id)
@@ -202,14 +171,14 @@ out:
 
 static void rds_rdma_listen_stop(void)
 {
-       if (rds_rdma_listen_id) {
-               rdsdebug("cm %p\n", rds_rdma_listen_id);
-               rdma_destroy_id(rds_rdma_listen_id);
-               rds_rdma_listen_id = NULL;
+       if (rds_iw_listen_id) {
+               rdsdebug("cm %p\n", rds_iw_listen_id);
+               rdma_destroy_id(rds_iw_listen_id);
+               rds_iw_listen_id = NULL;
        }
 }
 
-static int rds_rdma_init(void)
+int __init rds_rdma_init(void)
 {
        int ret;
 
@@ -234,18 +203,12 @@ err_iw_init:
 out:
        return ret;
 }
-module_init(rds_rdma_init);
 
-static void rds_rdma_exit(void)
+void rds_rdma_exit(void)
 {
        /* stop listening first to ensure no new connections are attempted */
        rds_rdma_listen_stop();
        rds_ib_exit();
        rds_iw_exit();
 }
-module_exit(rds_rdma_exit);
-
-MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
-MODULE_DESCRIPTION("RDS: IB/iWARP transport");
-MODULE_LICENSE("Dual BSD/GPL");
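
The listener bring-up this file reverts to follows the usual RDMA CM sequence: create a cm_id, bind it to INADDR_ANY:RDS_PORT, then listen. Condensed into one sketch, using the three-argument rdma_create_id() of the kernel generation this revert targets (later kernels add qp_type and network-namespace parameters); error handling trimmed for brevity:

#include <rdma/rdma_cm.h>

static struct rdma_cm_id *demo_rds_listen(void)
{
	struct sockaddr_in sin = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
		.sin_port = htons(RDS_PORT),
	};
	struct rdma_cm_id *cm_id;

	cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP);
	if (IS_ERR(cm_id))
		return cm_id;

	/* Bind the well-known RDS port on all addresses, then listen with
	 * the same backlog of 128 used above. */
	if (rdma_bind_addr(cm_id, (struct sockaddr *)&sin) ||
	    rdma_listen(cm_id, 128)) {
		rdma_destroy_id(cm_id);
		return ERR_PTR(-EADDRINUSE);	/* illustrative errno */
	}
	return cm_id;
}
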
 
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index faba4e382695e36c12c25f981b043077f4d7363c..2f2c7d976c219c787d337db96a46a020a6a3355d 100644
@@ -11,6 +11,10 @@ int rds_rdma_conn_connect(struct rds_connection *conn);
 int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                              struct rdma_cm_event *event);
 
+/* from rdma_transport.c */
+int rds_rdma_init(void);
+void rds_rdma_exit(void);
+
 /* from ib.c */
 extern struct rds_transport rds_ib_transport;
 int rds_ib_init(void);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 0d41155a2258cbbd16e19171c3daa376e3a83877..619f0a30a4e566952642e27e1d2b49f5f546ad11 100644
@@ -36,8 +36,8 @@
 #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
 #else
 /* sigh, pr_debug() causes unused variable warnings */
-static inline __printf(1, 2)
-void rdsdebug(char *fmt, ...)
+static inline void __attribute__ ((format (printf, 1, 2)))
+rdsdebug(char *fmt, ...)
 {
 }
 #endif
@@ -50,6 +50,7 @@ void rdsdebug(char *fmt, ...)
 #define RDS_FRAG_SIZE  ((unsigned int)(1 << RDS_FRAG_SHIFT))
 
 #define RDS_CONG_MAP_BYTES     (65536 / 8)
+#define RDS_CONG_MAP_LONGS     (RDS_CONG_MAP_BYTES / sizeof(unsigned long))
 #define RDS_CONG_MAP_PAGES     (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
 #define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
 
@@ -79,7 +80,6 @@ enum {
 /* Bits for c_flags */
 #define RDS_LL_SEND_FULL       0
 #define RDS_RECONNECT_PENDING  1
-#define RDS_IN_XMIT            2
 
 struct rds_connection {
        struct hlist_node       c_hash_node;
@@ -91,13 +91,12 @@ struct rds_connection {
        struct rds_cong_map     *c_lcong;
        struct rds_cong_map     *c_fcong;
 
+       struct mutex            c_send_lock;    /* protect send ring */
        struct rds_message      *c_xmit_rm;
        unsigned long           c_xmit_sg;
        unsigned int            c_xmit_hdr_off;
        unsigned int            c_xmit_data_off;
-       unsigned int            c_xmit_atomic_sent;
        unsigned int            c_xmit_rdma_sent;
-       unsigned int            c_xmit_data_sent;
 
        spinlock_t              c_lock;         /* protect msg queues */
        u64                     c_next_tx_seq;
@@ -110,7 +109,6 @@ struct rds_connection {
        void                    *c_transport_data;
 
        atomic_t                c_state;
-       unsigned long           c_send_gen;
        unsigned long           c_flags;
        unsigned long           c_reconnect_jiffies;
        struct delayed_work     c_send_w;
@@ -118,10 +116,11 @@ struct rds_connection {
        struct delayed_work     c_conn_w;
        struct work_struct      c_down_w;
        struct mutex            c_cm_lock;      /* protect conn state & cm */
-       wait_queue_head_t       c_waitq;
 
        struct list_head        c_map_item;
        unsigned long           c_map_queued;
+       unsigned long           c_map_offset;
+       unsigned long           c_map_bytes;
 
        unsigned int            c_unacked_packets;
        unsigned int            c_unacked_bytes;
@@ -133,7 +132,7 @@ struct rds_connection {
 #define RDS_FLAG_CONG_BITMAP   0x01
 #define RDS_FLAG_ACK_REQUIRED  0x02
 #define RDS_FLAG_RETRANSMITTED 0x04
-#define RDS_MAX_ADV_CREDIT     255
+#define RDS_MAX_ADV_CREDIT     127
 
 /*
  * Maximum space available for extension headers.
@@ -207,48 +206,6 @@ struct rds_incoming {
        rds_rdma_cookie_t       i_rdma_cookie;
 };
 
-struct rds_mr {
-       struct rb_node          r_rb_node;
-       atomic_t                r_refcount;
-       u32                     r_key;
-
-       /* A copy of the creation flags */
-       unsigned int            r_use_once:1;
-       unsigned int            r_invalidate:1;
-       unsigned int            r_write:1;
-
-       /* This is for RDS_MR_DEAD.
-        * It would be nice & consistent to make this part of the above
-        * bit field here, but we need to use test_and_set_bit.
-        */
-       unsigned long           r_state;
-       struct rds_sock         *r_sock; /* back pointer to the socket that owns us */
-       struct rds_transport    *r_trans;
-       void                    *r_trans_private;
-};
-
-/* Flags for mr->r_state */
-#define RDS_MR_DEAD            0
-
-static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
-{
-       return r_key | (((u64) offset) << 32);
-}
-
-static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
-{
-       return cookie;
-}
-
-static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
-{
-       return cookie >> 32;
-}
-
-/* atomic operation types */
-#define RDS_ATOMIC_TYPE_CSWP           0
-#define RDS_ATOMIC_TYPE_FADD           1
-
 /*
  * m_sock_item and m_conn_item are on lists that are serialized under
  * conn->c_lock.  m_sock_item has additional meaning in that once it is empty
@@ -301,71 +258,13 @@ struct rds_message {
         *   -> rs->rs_lock
         */
        spinlock_t              m_rs_lock;
-       wait_queue_head_t       m_flush_wait;
-
        struct rds_sock         *m_rs;
-
-       /* cookie to send to remote, in rds header */
+       struct rds_rdma_op      *m_rdma_op;
        rds_rdma_cookie_t       m_rdma_cookie;
-
-       unsigned int            m_used_sgs;
-       unsigned int            m_total_sgs;
-
-       void                    *m_final_op;
-
-       struct {
-               struct rm_atomic_op {
-                       int                     op_type;
-                       union {
-                               struct {
-                                       uint64_t        compare;
-                                       uint64_t        swap;
-                                       uint64_t        compare_mask;
-                                       uint64_t        swap_mask;
-                               } op_m_cswp;
-                               struct {
-                                       uint64_t        add;
-                                       uint64_t        nocarry_mask;
-                               } op_m_fadd;
-                       };
-
-                       u32                     op_rkey;
-                       u64                     op_remote_addr;
-                       unsigned int            op_notify:1;
-                       unsigned int            op_recverr:1;
-                       unsigned int            op_mapped:1;
-                       unsigned int            op_silent:1;
-                       unsigned int            op_active:1;
-                       struct scatterlist      *op_sg;
-                       struct rds_notifier     *op_notifier;
-
-                       struct rds_mr           *op_rdma_mr;
-               } atomic;
-               struct rm_rdma_op {
-                       u32                     op_rkey;
-                       u64                     op_remote_addr;
-                       unsigned int            op_write:1;
-                       unsigned int            op_fence:1;
-                       unsigned int            op_notify:1;
-                       unsigned int            op_recverr:1;
-                       unsigned int            op_mapped:1;
-                       unsigned int            op_silent:1;
-                       unsigned int            op_active:1;
-                       unsigned int            op_bytes;
-                       unsigned int            op_nents;
-                       unsigned int            op_count;
-                       struct scatterlist      *op_sg;
-                       struct rds_notifier     *op_notifier;
-
-                       struct rds_mr           *op_rdma_mr;
-               } rdma;
-               struct rm_data_op {
-                       unsigned int            op_active:1;
-                       unsigned int            op_nents;
-                       unsigned int            op_count;
-                       struct scatterlist      *op_sg;
-               } data;
-       };
+       struct rds_mr           *m_rdma_mr;
+       unsigned int            m_nents;
+       unsigned int            m_count;
+       struct scatterlist      m_sg[0];
 };
 
 /*
@@ -406,19 +305,17 @@ struct rds_notifier {
  *                 transport is responsible for other serialization, including
  *                 rds_recv_incoming().  This is called in process context but
  *                 should try hard not to block.
+ *
+ * @xmit_cong_map: This asks the transport to send the local bitmap down the
+ *                given connection.  XXX get a better story about the bitmap
+ *                flag and header.
  */
 
-#define RDS_TRANS_IB   0
-#define RDS_TRANS_IWARP        1
-#define RDS_TRANS_TCP  2
-#define RDS_TRANS_COUNT        3
-
 struct rds_transport {
        char                    t_name[TRANSNAMSIZ];
        struct list_head        t_item;
        struct module           *t_owner;
        unsigned int            t_prefer_loopback:1;
-       unsigned int            t_type;
 
        int (*laddr_check)(__be32 addr);
        int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
@@ -429,10 +326,13 @@ struct rds_transport {
        void (*xmit_complete)(struct rds_connection *conn);
        int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
                    unsigned int hdr_off, unsigned int sg, unsigned int off);
-       int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
-       int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
+       int (*xmit_cong_map)(struct rds_connection *conn,
+                            struct rds_cong_map *map, unsigned long offset);
+       int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
        int (*recv)(struct rds_connection *conn);
-       int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to);
+       int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
+                               size_t size);
+       void (*inc_purge)(struct rds_incoming *inc);
        void (*inc_free)(struct rds_incoming *inc);
 
        int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
@@ -461,11 +361,17 @@ struct rds_sock {
         * bound_addr used for both incoming and outgoing, no INADDR_ANY
         * support.
         */
-       struct hlist_node       rs_bound_node;
+       struct rb_node          rs_bound_node;
        __be32                  rs_bound_addr;
        __be32                  rs_conn_addr;
        __be16                  rs_bound_port;
        __be16                  rs_conn_port;
+
+       /*
+        * This is only used to communicate the transport between bind and
+        * initiating connections.  All other trans use is referenced through
+        * the connection.
+        */
        struct rds_transport    *rs_transport;
 
        /*
@@ -476,8 +382,6 @@ struct rds_sock {
 
        /* flag indicating we were congested or not */
        int                     rs_congested;
-       /* seen congestion (ENOBUFS) when sending? */
-       int                     rs_seen_congestion;
 
        /* rs_lock protects all these adjacent members before the newline */
        spinlock_t              rs_lock;
@@ -554,8 +458,8 @@ struct rds_statistics {
        uint64_t        s_recv_ping;
        uint64_t        s_send_queue_empty;
        uint64_t        s_send_queue_full;
-       uint64_t        s_send_lock_contention;
-       uint64_t        s_send_lock_queue_raced;
+       uint64_t        s_send_sem_contention;
+       uint64_t        s_send_sem_queue_raced;
        uint64_t        s_send_immediate_retry;
        uint64_t        s_send_delayed_retry;
        uint64_t        s_send_drop_acked;
@@ -575,13 +479,12 @@ struct rds_statistics {
 };
 
 /* af_rds.c */
-char *rds_str_array(char **array, size_t elements, size_t index);
 void rds_sock_addref(struct rds_sock *rs);
 void rds_sock_put(struct rds_sock *rs);
 void rds_wake_sk_sleep(struct rds_sock *rs);
 static inline void __rds_wake_sk_sleep(struct sock *sk)
 {
-       wait_queue_head_t *waitq = sk_sleep(sk);
+       wait_queue_head_t *waitq = sk->sk_sleep;
 
        if (!sock_flag(sk, SOCK_DEAD) && waitq)
                wake_up(waitq);
@@ -610,23 +513,22 @@ void rds_cong_exit(void);
 struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
 
 /* conn.c */
-int rds_conn_init(void);
+int __init rds_conn_init(void);
 void rds_conn_exit(void);
 struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
                                       struct rds_transport *trans, gfp_t gfp);
 struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
                               struct rds_transport *trans, gfp_t gfp);
-void rds_conn_shutdown(struct rds_connection *conn);
 void rds_conn_destroy(struct rds_connection *conn);
+void rds_conn_reset(struct rds_connection *conn);
 void rds_conn_drop(struct rds_connection *conn);
-void rds_conn_connect_if_down(struct rds_connection *conn);
 void rds_for_each_conn_info(struct socket *sock, unsigned int len,
                          struct rds_info_iterator *iter,
                          struct rds_info_lengths *lens,
                          int (*visitor)(struct rds_connection *, void *),
                          size_t item_len);
-__printf(2, 3)
-void __rds_conn_error(struct rds_connection *conn, const char *, ...);
+void __rds_conn_error(struct rds_connection *conn, const char *, ...)
+                               __attribute__ ((format (printf, 2, 3)));
 #define rds_conn_error(conn, fmt...) \
        __rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
 
@@ -656,8 +558,8 @@ rds_conn_connecting(struct rds_connection *conn)
 
 /* message.c */
 struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
-struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
-int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from);
+struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+                                              size_t total_len);
 struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
                                 __be16 dport, u64 seq);
@@ -665,8 +567,12 @@ int rds_message_add_extension(struct rds_header *hdr,
                              unsigned int type, const void *data, unsigned int len);
 int rds_message_next_extension(struct rds_header *hdr,
                               unsigned int *pos, void *buf, unsigned int *buflen);
+int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
+int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
 int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
-int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+int rds_message_inc_copy_to_user(struct rds_incoming *inc,
+                                struct iovec *first_iov, size_t size);
+void rds_message_inc_purge(struct rds_incoming *inc);
 void rds_message_inc_free(struct rds_incoming *inc);
 void rds_message_addref(struct rds_message *rm);
 void rds_message_put(struct rds_message *rm);
@@ -700,11 +606,12 @@ void rds_page_exit(void);
 /* recv.c */
 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
                  __be32 saddr);
+void rds_inc_addref(struct rds_incoming *inc);
 void rds_inc_put(struct rds_incoming *inc);
 void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
-                      struct rds_incoming *inc, gfp_t gfp);
-int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
-               int msg_flags);
+                      struct rds_incoming *inc, gfp_t gfp, enum km_type km);
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+               size_t size, int msg_flags);
 void rds_clear_recv_queue(struct rds_sock *rs);
 int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
 void rds_inc_info_copy(struct rds_incoming *inc,
@@ -712,7 +619,8 @@ void rds_inc_info_copy(struct rds_incoming *inc,
                       __be32 saddr, __be32 daddr, int flip);
 
 /* send.c */
-int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+               size_t payload_len);
 void rds_send_reset(struct rds_connection *conn);
 int rds_send_xmit(struct rds_connection *conn);
 struct sockaddr_in;
@@ -720,41 +628,17 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
 typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
 void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
                         is_acked_func is_acked);
+int rds_send_acked_before(struct rds_connection *conn, u64 seq);
+void rds_send_remove_from_sock(struct list_head *messages, int status);
 int rds_send_pong(struct rds_connection *conn, __be16 dport);
 struct rds_message *rds_send_get_message(struct rds_connection *,
-                                        struct rm_rdma_op *);
+                                        struct rds_rdma_op *);
 
 /* rdma.c */
 void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
-int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
-void rds_rdma_drop_keys(struct rds_sock *rs);
-int rds_rdma_extra_size(struct rds_rdma_args *args);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-                         struct cmsghdr *cmsg);
-int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
-                         struct cmsghdr *cmsg);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-                         struct cmsghdr *cmsg);
-int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
-                         struct cmsghdr *cmsg);
-void rds_rdma_free_op(struct rm_rdma_op *ro);
-void rds_atomic_free_op(struct rm_atomic_op *ao);
-void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
-void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
-int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
-                   struct cmsghdr *cmsg);
-
-void __rds_put_mr_final(struct rds_mr *mr);
-static inline void rds_mr_put(struct rds_mr *mr)
-{
-       if (atomic_dec_and_test(&mr->r_refcount))
-               __rds_put_mr_final(mr);
-}
 
 /* stats.c */
-DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+DECLARE_PER_CPU(struct rds_statistics, rds_stats);
 #define rds_stats_inc_which(which, member) do {                \
        per_cpu(which, get_cpu()).member++;             \
        put_cpu();                                      \
@@ -765,14 +649,13 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
        put_cpu();                                      \
 } while (0)
 #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
-int rds_stats_init(void);
+int __init rds_stats_init(void);
 void rds_stats_exit(void);
 void rds_stats_info_copy(struct rds_info_iterator *iter,
-                        uint64_t *values, const char *const *names,
-                        size_t nr);
+                        uint64_t *values, char **names, size_t nr);
 
 /* sysctl.c */
-int rds_sysctl_init(void);
+int __init rds_sysctl_init(void);
 void rds_sysctl_exit(void);
 extern unsigned long rds_sysctl_sndbuf_min;
 extern unsigned long rds_sysctl_sndbuf_default;
@@ -786,10 +669,9 @@ extern unsigned long rds_sysctl_trace_flags;
 extern unsigned int  rds_sysctl_trace_level;
 
 /* threads.c */
-int rds_threads_init(void);
+int __init rds_threads_init(void);
 void rds_threads_exit(void);
 extern struct workqueue_struct *rds_wq;
-void rds_queue_reconnect(struct rds_connection *conn);
 void rds_connect_worker(struct work_struct *);
 void rds_shutdown_worker(struct work_struct *);
 void rds_send_worker(struct work_struct *);
@@ -800,10 +682,9 @@ void rds_connect_complete(struct rds_connection *conn);
 int rds_trans_register(struct rds_transport *trans);
 void rds_trans_unregister(struct rds_transport *trans);
 struct rds_transport *rds_trans_get_preferred(__be32 addr);
-void rds_trans_put(struct rds_transport *trans);
 unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
                                       unsigned int avail);
-int rds_trans_init(void);
+int __init rds_trans_init(void);
 void rds_trans_exit(void);
 
 #endif
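
The rds_stats_inc()/rds_stats_add() macros above update a per-CPU copy of struct rds_statistics: get_cpu() disables preemption so the task cannot migrate mid-increment, and put_cpu() re-enables it. A minimal sketch of the same pattern with a hypothetical counter struct, not from the patch:

#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/types.h>

struct demo_stats {
	u64	s_demo_events;
};

static DEFINE_PER_CPU(struct demo_stats, demo_stats);

/* Equivalent of rds_stats_inc(s_demo_events): bump this CPU's counter
 * with preemption disabled; a reader later sums all per-CPU copies. */
static inline void demo_stats_inc(void)
{
	per_cpu(demo_stats, get_cpu()).s_demo_events++;
	put_cpu();
}
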
diff --git a/net/rds/recv.c b/net/rds/recv.c
index a00462b0d01de9ee2793d4fb273bf568b2eefc29..f2118c51cfa3b794c13177870e8966b600d667cf 100644
  *
  */
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <net/sock.h>
 #include <linux/in.h>
-#include <linux/export.h>
 
 #include "rds.h"
+#include "rdma.h"
 
 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
                  __be32 saddr)
@@ -47,9 +46,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
        inc->i_saddr = saddr;
        inc->i_rdma_cookie = 0;
 }
-EXPORT_SYMBOL_GPL(rds_inc_init);
 
-static void rds_inc_addref(struct rds_incoming *inc)
+void rds_inc_addref(struct rds_incoming *inc)
 {
        rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
        atomic_inc(&inc->i_refcount);
@@ -64,7 +62,6 @@ void rds_inc_put(struct rds_incoming *inc)
                inc->i_conn->c_trans->inc_free(inc);
        }
 }
-EXPORT_SYMBOL_GPL(rds_inc_put);
 
 static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
                                  struct rds_cong_map *map,
@@ -155,7 +152,7 @@ static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock
  * tell us which roles the addrs in the conn are playing for this message.
  */
 void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
-                      struct rds_incoming *inc, gfp_t gfp)
+                      struct rds_incoming *inc, gfp_t gfp, enum km_type km)
 {
        struct rds_sock *rs = NULL;
        struct sock *sk;
@@ -195,8 +192,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
         * XXX we could spend more on the wire to get more robust failure
         * detection, arguably worth it to avoid data corruption.
         */
-       if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq &&
-           (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
+       if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq
+        && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
                rds_stats_inc(s_recv_drop_old_seq);
                goto out;
        }
@@ -209,7 +206,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
        }
 
        rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
-       if (!rs) {
+       if (rs == NULL) {
                rds_stats_inc(s_recv_drop_no_sock);
                goto out;
        }
@@ -240,7 +237,6 @@ out:
        if (rs)
                rds_sock_put(rs);
 }
-EXPORT_SYMBOL_GPL(rds_recv_incoming);
 
 /*
  * be very careful here.  This is being called as the condition in
@@ -250,7 +246,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
 {
        unsigned long flags;
 
-       if (!*inc) {
+       if (*inc == NULL) {
                read_lock_irqsave(&rs->rs_recv_lock, flags);
                if (!list_empty(&rs->rs_recv_queue)) {
                        *inc = list_entry(rs->rs_recv_queue.next,
@@ -296,7 +292,7 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
 int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
 {
        struct rds_notifier *notifier;
-       struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */
+       struct rds_rdma_notify cmsg;
        unsigned int count = 0, max_messages = ~0U;
        unsigned long flags;
        LIST_HEAD(copy);
@@ -333,10 +329,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
 
                if (msghdr) {
                        cmsg.user_token = notifier->n_user_token;
-                       cmsg.status = notifier->n_status;
+                       cmsg.status  = notifier->n_status;
 
                        err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
-                                      sizeof(cmsg), &cmsg);
+                                       sizeof(cmsg), &cmsg);
                        if (err)
                                break;
                }
@@ -395,14 +391,14 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
        return 0;
 }
 
-int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
-               int msg_flags)
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+               size_t size, int msg_flags)
 {
        struct sock *sk = sock->sk;
        struct rds_sock *rs = rds_sk_to_rs(sk);
        long timeo;
        int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
-       DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+       struct sockaddr_in *sin;
        struct rds_incoming *inc = NULL;
 
        /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
@@ -413,29 +409,27 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
        if (msg_flags & MSG_OOB)
                goto out;
 
-       while (1) {
-               struct iov_iter save;
-               /* If there are pending notifications, do those - and nothing else */
-               if (!list_empty(&rs->rs_notify_queue)) {
-                       ret = rds_notify_queue_get(rs, msg);
-                       break;
-               }
+       /* If there are pending notifications, do those - and nothing else */
+       if (!list_empty(&rs->rs_notify_queue)) {
+               ret = rds_notify_queue_get(rs, msg);
+               goto out;
+       }
 
-               if (rs->rs_cong_notify) {
-                       ret = rds_notify_cong(rs, msg);
-                       break;
-               }
+       if (rs->rs_cong_notify) {
+               ret = rds_notify_cong(rs, msg);
+               goto out;
+       }
 
+       while (1) {
                if (!rds_next_incoming(rs, &inc)) {
                        if (nonblock) {
                                ret = -EAGAIN;
                                break;
                        }
 
-                       timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
-                                       (!list_empty(&rs->rs_notify_queue) ||
-                                        rs->rs_cong_notify ||
-                                        rds_next_incoming(rs, &inc)), timeo);
+                       timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
+                                               rds_next_incoming(rs, &inc),
+                                               timeo);
                        rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
                                 timeo);
                        if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
@@ -450,8 +444,8 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
                         &inc->i_conn->c_faddr,
                         ntohs(inc->i_hdr.h_sport));
-               save = msg->msg_iter;
-               ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
+               ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
+                                                            size);
                if (ret < 0)
                        break;
 
@@ -464,7 +458,6 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                        rds_inc_put(inc);
                        inc = NULL;
                        rds_stats_inc(s_recv_deliver_raced);
-                       msg->msg_iter = save;
                        continue;
                }
 
@@ -481,12 +474,12 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 
                rds_stats_inc(s_recv_delivered);
 
+               sin = (struct sockaddr_in *)msg->msg_name;
                if (sin) {
                        sin->sin_family = AF_INET;
                        sin->sin_port = inc->i_hdr.h_sport;
                        sin->sin_addr.s_addr = inc->i_saddr;
                        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
-                       msg->msg_namelen = sizeof(*sin);
                }
                break;
        }
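
rds_notify_queue_get() above delivers RDMA completion notifications as RDS_CMSG_RDMA_STATUS control messages on recvmsg(). A userspace sketch of draining them, assuming struct rds_rdma_notify and the cmsg constants from linux/rds.h (SOL_RDS is 276 per linux/socket.h); illustrative only:

#include <linux/rds.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

#ifndef SOL_RDS
#define SOL_RDS 276			/* from linux/socket.h */
#endif

/* Drain queued RDMA completion notifications from a PF_RDS socket on
 * which RDS_RDMA_NOTIFY_ME transfers were issued. */
static void drain_notifications(int fd)
{
	char cbuf[1024], dbuf[1];
	struct iovec iov = { .iov_base = dbuf, .iov_len = sizeof(dbuf) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;

	if (recvmsg(fd, &msg, MSG_DONTWAIT) < 0)
		return;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		struct rds_rdma_notify note;

		if (cmsg->cmsg_level != SOL_RDS ||
		    cmsg->cmsg_type != RDS_CMSG_RDMA_STATUS)
			continue;
		memcpy(&note, CMSG_DATA(cmsg), sizeof(note));
		printf("token %llu status %d\n",
		       (unsigned long long)note.user_token, (int)note.status);
	}
}
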
diff --git a/net/rds/send.c b/net/rds/send.c
index e9430f537f9c2bb23bbaeeb66933e1e85058bd34..104fe033203da51eeef849782e1c291e072cdde6 100644
  *
  */
 #include <linux/kernel.h>
-#include <linux/moduleparam.h>
-#include <linux/gfp.h>
 #include <net/sock.h>
 #include <linux/in.h>
 #include <linux/list.h>
-#include <linux/ratelimit.h>
-#include <linux/export.h>
 
 #include "rds.h"
+#include "rdma.h"
 
 /* When transmitting messages in rds_send_xmit, we need to emerge from
  * time to time and briefly release the CPU. Otherwise the softlock watchdog
@@ -55,11 +52,8 @@ static int send_batch_count = 64;
 module_param(send_batch_count, int, 0444);
 MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
 
-static void rds_send_remove_from_sock(struct list_head *messages, int status);
-
 /*
- * Reset the send state.  Callers must ensure that this doesn't race with
- * rds_send_xmit().
+ * Reset the send state. Caller must hold c_send_lock when calling here.
  */
 void rds_send_reset(struct rds_connection *conn)
 {
@@ -67,22 +61,18 @@ void rds_send_reset(struct rds_connection *conn)
        unsigned long flags;
 
        if (conn->c_xmit_rm) {
-               rm = conn->c_xmit_rm;
-               conn->c_xmit_rm = NULL;
                /* Tell the user the RDMA op is no longer mapped by the
                 * transport. This isn't entirely true (it's flushed out
                 * independently) but as the connection is down, there's
                 * no ongoing RDMA to/from that memory */
-               rds_message_unmapped(rm);
-               rds_message_put(rm);
+               rds_message_unmapped(conn->c_xmit_rm);
+               rds_message_put(conn->c_xmit_rm);
+               conn->c_xmit_rm = NULL;
        }
-
        conn->c_xmit_sg = 0;
        conn->c_xmit_hdr_off = 0;
        conn->c_xmit_data_off = 0;
-       conn->c_xmit_atomic_sent = 0;
        conn->c_xmit_rdma_sent = 0;
-       conn->c_xmit_data_sent = 0;
 
        conn->c_map_queued = 0;
 
@@ -99,27 +89,8 @@ void rds_send_reset(struct rds_connection *conn)
        spin_unlock_irqrestore(&conn->c_lock, flags);
 }
 
-static int acquire_in_xmit(struct rds_connection *conn)
-{
-       return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
-}
-
-static void release_in_xmit(struct rds_connection *conn)
-{
-       clear_bit(RDS_IN_XMIT, &conn->c_flags);
-       smp_mb__after_atomic();
-       /*
-        * We don't use wait_on_bit()/wake_up_bit() because our waking is in a
-        * hot path and finding waiters is very rare.  We don't want to walk
-        * the system-wide hashed waitqueue buckets in the fast path only to
-        * almost never find waiters.
-        */
-       if (waitqueue_active(&conn->c_waitq))
-               wake_up_all(&conn->c_waitq);
-}
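
The acquire_in_xmit()/release_in_xmit() pair removed here is a bit-lock: test_and_set_bit() serves as a trylock on the RDS_IN_XMIT flag, and unlock is clear_bit() plus a barrier and an explicit wakeup, with waitqueue_active() keeping the common no-waiter case cheap. The pattern in isolation, with generic names not taken from the patch:

#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/wait.h>

#define DEMO_IN_XMIT	2	/* bit index within 'flags', as RDS_IN_XMIT was */

/* Trylock: true if this caller now owns the transmit path. */
static bool demo_acquire(unsigned long *flags)
{
	return test_and_set_bit(DEMO_IN_XMIT, flags) == 0;
}

/* Unlock: clear the bit, order the clear before the waitqueue check,
 * then wake anyone who blocked waiting for the transmit path. */
static void demo_release(unsigned long *flags, wait_queue_head_t *waitq)
{
	clear_bit(DEMO_IN_XMIT, flags);
	smp_mb__after_atomic();
	if (waitqueue_active(waitq))
		wake_up_all(waitq);
}
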
-
 /*
- * We're making the conscious trade-off here to only send one message
+ * We're making the concious trade-off here to only send one message
  * down the connection at a time.
  *   Pro:
  *      - tx queueing is a simple fifo list
@@ -137,14 +108,11 @@ int rds_send_xmit(struct rds_connection *conn)
        struct rds_message *rm;
        unsigned long flags;
        unsigned int tmp;
+       unsigned int send_quota = send_batch_count;
        struct scatterlist *sg;
        int ret = 0;
+       int was_empty = 0;
        LIST_HEAD(to_be_dropped);
-       int batch_count;
-       unsigned long send_gen = 0;
-
-restart:
-       batch_count = 0;
 
        /*
         * sendmsg calls here after having queued its message on the send
@@ -152,31 +120,14 @@ restart:
         * another thread is already feeding the queue then we back off.  This
         * avoids blocking the caller and trading per-connection data between
         * caches per message.
-        */
-       if (!acquire_in_xmit(conn)) {
-               rds_stats_inc(s_send_lock_contention);
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       /*
-        * we record the send generation after doing the xmit acquire.
-        * if someone else manages to jump in and do some work, we'll use
-        * this to avoid a goto restart farther down.
         *
-        * The acquire_in_xmit() check above ensures that only one
-        * caller can increment c_send_gen at any time.
+        * The sem holder will issue a retry if they notice that someone queued
+        * a message after they stopped walking the send queue but before they
+        * dropped the sem.
         */
-       conn->c_send_gen++;
-       send_gen = conn->c_send_gen;
-
-       /*
-        * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
-        * we do the opposite to avoid races.
-        */
-       if (!rds_conn_up(conn)) {
-               release_in_xmit(conn);
-               ret = 0;
+       if (!mutex_trylock(&conn->c_send_lock)) {
+               rds_stats_inc(s_send_sem_contention);
+               ret = -ENOMEM;
                goto out;
        }
 
@@ -185,47 +136,76 @@ restart:
 
        /*
         * spin trying to push headers and data down the connection until
-        * the connection doesn't make forward progress.
+        * the connection doens't make forward progress.
         */
-       while (1) {
+       while (--send_quota) {
+               /*
+                * See if need to send a congestion map update if we're
+                * between sending messages.  The send_sem protects our sole
+                * use of c_map_offset and _bytes.
+                * Note this is used only by transports that define a special
+                * xmit_cong_map function. For all others, we allocate
+                * a cong_map message and treat it just like any other send.
+                */
+               if (conn->c_map_bytes) {
+                       ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
+                                               conn->c_map_offset);
+                       if (ret <= 0)
+                               break;
+
+                       conn->c_map_offset += ret;
+                       conn->c_map_bytes -= ret;
+                       if (conn->c_map_bytes)
+                               continue;
+               }
 
+               /* If we're done sending the current message, clear the
+                * offset and S/G temporaries.
+                */
                rm = conn->c_xmit_rm;
+               if (rm != NULL &&
+                   conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
+                   conn->c_xmit_sg == rm->m_nents) {
+                       conn->c_xmit_rm = NULL;
+                       conn->c_xmit_sg = 0;
+                       conn->c_xmit_hdr_off = 0;
+                       conn->c_xmit_data_off = 0;
+                       conn->c_xmit_rdma_sent = 0;
 
-               /*
-                * If between sending messages, we can send a pending congestion
-                * map update.
+                       /* Release the reference to the previous message. */
+                       rds_message_put(rm);
+                       rm = NULL;
+               }
+
+               /* If we're asked to send a cong map update, do so.
                 */
-               if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
+               if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
+                       if (conn->c_trans->xmit_cong_map != NULL) {
+                               conn->c_map_offset = 0;
+                               conn->c_map_bytes = sizeof(struct rds_header) +
+                                       RDS_CONG_MAP_BYTES;
+                               continue;
+                       }
+
                        rm = rds_cong_update_alloc(conn);
                        if (IS_ERR(rm)) {
                                ret = PTR_ERR(rm);
                                break;
                        }
-                       rm->data.op_active = 1;
 
                        conn->c_xmit_rm = rm;
                }
 
                /*
-                * If not already working on one, grab the next message.
+                * Grab the next message from the send queue, if there is one.
                 *
                 * c_xmit_rm holds a ref while we're sending this message down
                 * the connection.  We can use this ref while holding the
                 * send_sem; rds_send_reset() is serialized with it.
                 */
-               if (!rm) {
+               if (rm == NULL) {
                        unsigned int len;
 
-                       batch_count++;
-
-                       /* we want to process as big a batch as we can, but
-                        * we also want to avoid softlockups.  If we've been
-                        * through a lot of messages, lets back off and see
-                        * if anyone else jumps in
-                        */
-                       if (batch_count >= 1024)
-                               goto over_batch;
-
                        spin_lock_irqsave(&conn->c_lock, flags);
 
                        if (!list_empty(&conn->c_send_queue)) {
@@ -243,8 +223,10 @@ restart:
 
                        spin_unlock_irqrestore(&conn->c_lock, flags);
 
-                       if (!rm)
+                       if (rm == NULL) {
+                               was_empty = 1;
                                break;
+                       }
 
                        /* Unfortunately, the way Infiniband deals with
                         * RDMA to a bad MR key is by moving the entire
@@ -253,19 +235,20 @@ restart:
                         * connection.
                         * Therefore, we never retransmit messages with RDMA ops.
                         */
-                       if (rm->rdma.op_active &&
-                           test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
+                       if (rm->m_rdma_op
+                        && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
                                spin_lock_irqsave(&conn->c_lock, flags);
                                if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
                                        list_move(&rm->m_conn_item, &to_be_dropped);
                                spin_unlock_irqrestore(&conn->c_lock, flags);
+                               rds_message_put(rm);
                                continue;
                        }
 
                        /* Require an ACK every once in a while */
                        len = ntohl(rm->m_inc.i_hdr.h_len);
-                       if (conn->c_unacked_packets == 0 ||
-                           conn->c_unacked_bytes < len) {
+                       if (conn->c_unacked_packets == 0
+                        || conn->c_unacked_bytes < len) {
                                __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
 
                                conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
@@ -279,55 +262,23 @@ restart:
                        conn->c_xmit_rm = rm;
                }
 
-               /* The transport either sends the whole rdma or none of it */
-               if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
-                       rm->m_final_op = &rm->rdma;
-                       ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
+               /*
+                * Try and send an rdma message.  Let's see if we can
+                * keep this simple and require that the transport either
+                * send the whole rdma or none of it.
+                */
+               if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
+                       ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
                        if (ret)
                                break;
                        conn->c_xmit_rdma_sent = 1;
-
-                       /* The transport owns the mapped memory for now.
-                        * You can't unmap it while it's on the send queue */
-                       set_bit(RDS_MSG_MAPPED, &rm->m_flags);
-               }
-
-               if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
-                       rm->m_final_op = &rm->atomic;
-                       ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
-                       if (ret)
-                               break;
-                       conn->c_xmit_atomic_sent = 1;
-
                        /* The transport owns the mapped memory for now.
                         * You can't unmap it while it's on the send queue */
                        set_bit(RDS_MSG_MAPPED, &rm->m_flags);
                }
 
-               /*
-                * A number of cases require an RDS header to be sent
-                * even if there is no data.
-                * We permit 0-byte sends; rds-ping depends on this.
-                * However, if there are exclusively attached silent ops,
-                * we skip the hdr/data send, to enable silent operation.
-                */
-               if (rm->data.op_nents == 0) {
-                       int ops_present;
-                       int all_ops_are_silent = 1;
-
-                       ops_present = (rm->atomic.op_active || rm->rdma.op_active);
-                       if (rm->atomic.op_active && !rm->atomic.op_silent)
-                               all_ops_are_silent = 0;
-                       if (rm->rdma.op_active && !rm->rdma.op_silent)
-                               all_ops_are_silent = 0;
-
-                       if (ops_present && all_ops_are_silent
-                           && !rm->m_rdma_cookie)
-                               rm->data.op_active = 0;
-               }
-
-               if (rm->data.op_active && !conn->c_xmit_data_sent) {
-                       rm->m_final_op = &rm->data;
+               if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
+                   conn->c_xmit_sg < rm->m_nents) {
                        ret = conn->c_trans->xmit(conn, rm,
                                                  conn->c_xmit_hdr_off,
                                                  conn->c_xmit_sg,
@@ -343,7 +294,7 @@ restart:
                                ret -= tmp;
                        }
 
-                       sg = &rm->data.op_sg[conn->c_xmit_sg];
+                       sg = &rm->m_sg[conn->c_xmit_sg];
                        while (ret) {
                                tmp = min_t(int, ret, sg->length -
                                                      conn->c_xmit_data_off);
@@ -354,68 +305,49 @@ restart:
                                        sg++;
                                        conn->c_xmit_sg++;
                                        BUG_ON(ret != 0 &&
-                                              conn->c_xmit_sg == rm->data.op_nents);
+                                              conn->c_xmit_sg == rm->m_nents);
                                }
                        }
-
-                       if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
-                           (conn->c_xmit_sg == rm->data.op_nents))
-                               conn->c_xmit_data_sent = 1;
-               }
-
-               /*
-                * A rm will only take multiple times through this loop
-                * if there is a data op. Thus, if the data is sent (or there was
-                * none), then we're done with the rm.
-                */
-               if (!rm->data.op_active || conn->c_xmit_data_sent) {
-                       conn->c_xmit_rm = NULL;
-                       conn->c_xmit_sg = 0;
-                       conn->c_xmit_hdr_off = 0;
-                       conn->c_xmit_data_off = 0;
-                       conn->c_xmit_rdma_sent = 0;
-                       conn->c_xmit_atomic_sent = 0;
-                       conn->c_xmit_data_sent = 0;
-
-                       rds_message_put(rm);
                }
        }
 
-over_batch:
-       if (conn->c_trans->xmit_complete)
-               conn->c_trans->xmit_complete(conn);
-       release_in_xmit(conn);
-
        /* Nuke any messages we decided not to retransmit. */
-       if (!list_empty(&to_be_dropped)) {
-               /* irqs on here, so we can put(), unlike above */
-               list_for_each_entry(rm, &to_be_dropped, m_conn_item)
-                       rds_message_put(rm);
+       if (!list_empty(&to_be_dropped))
                rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
-       }
+
+       if (conn->c_trans->xmit_complete)
+               conn->c_trans->xmit_complete(conn);
 
        /*
-        * Other senders can queue a message after we last test the send queue
-        * but before we clear RDS_IN_XMIT.  In that case they'd back off and
-        * not try and send their newly queued message.  We need to check the
-        * send queue after having cleared RDS_IN_XMIT so that their message
-        * doesn't get stuck on the send queue.
+        * We might be racing with another sender who queued a message but
+        * backed off on noticing that we held the c_send_lock.  If we check
+        * for queued messages after dropping the sem then either we'll
+        * see the queued message or the queuer will get the sem.  If we
+        * notice the queued message then we trigger an immediate retry.
         *
-        * If the transport cannot continue (i.e ret != 0), then it must
-        * call us when more room is available, such as from the tx
-        * completion handler.
-        *
-        * We have an extra generation check here so that if someone manages
-        * to jump in after our release_in_xmit, we'll see that they have done
-        * some work and we will skip our goto
+        * We need to be careful only to do this when we stopped processing
+        * the send queue because it was empty.  It's the only way we
+        * stop processing the loop when the transport hasn't taken
+        * responsibility for forward progress.
         */
-       if (ret == 0) {
-               smp_mb();
-               if (!list_empty(&conn->c_send_queue) &&
-                   send_gen == conn->c_send_gen) {
-                       rds_stats_inc(s_send_lock_queue_raced);
-                       goto restart;
+       mutex_unlock(&conn->c_send_lock);
+
+       if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
+               /* We exhausted the send quota, but there's work left to
+                * do. Return and (re-)schedule the send worker.
+                */
+               ret = -EAGAIN;
+       }
+
+       if (ret == 0 && was_empty) {
+               /* A simple bit test would be way faster than taking the
+                * spin lock */
+               spin_lock_irqsave(&conn->c_lock, flags);
+               if (!list_empty(&conn->c_send_queue)) {
+                       rds_stats_inc(s_send_sem_queue_raced);
+                       ret = -EAGAIN;
                }
+               spin_unlock_irqrestore(&conn->c_lock, flags);
        }
 out:
        return ret;
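
The unlock-then-recheck dance above is the heart of the reverted locking scheme: a sender that finds c_send_lock held simply backs off, so the lock holder must look at the queue one more time after dropping the lock, or a message queued in that window would sit forever. A minimal sketch of the pattern, assuming a simplified connection struct; the toy_* names are hypothetical, not real RDS symbols:

#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/errno.h>

struct toy_conn {
	struct mutex		send_lock;	/* plays the role of c_send_lock */
	spinlock_t		lock;		/* plays the role of c_lock */
	struct list_head	send_queue;	/* plays the role of c_send_queue */
};

static int toy_send_xmit(struct toy_conn *conn)
{
	unsigned long flags;
	int ret = 0;

	/* Someone else is already draining the queue; back off. */
	if (!mutex_trylock(&conn->send_lock))
		return -ENOMEM;	/* mirrors the errno the RDS code picks */

	/* ... walk conn->send_queue here, pushing messages down ... */

	mutex_unlock(&conn->send_lock);

	/*
	 * A sender may have enqueued after we stopped walking but before
	 * the unlock; it saw the lock held and backed off.  Re-checking
	 * after the unlock guarantees that message is not stranded.
	 */
	spin_lock_irqsave(&conn->lock, flags);
	if (!list_empty(&conn->send_queue))
		ret = -EAGAIN;	/* caller reschedules the send worker */
	spin_unlock_irqrestore(&conn->lock, flags);

	return ret;
}
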
@@ -443,60 +375,52 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
 }
 
 /*
- * This is pretty similar to what happens below in the ACK
- * handling code - except that we call here as soon as we get
- * the IB send completion on the RDMA op and the accompanying
- * message.
+ * Returns true if there are no messages on the send and retransmit queues
+ * which have a sequence number greater than or equal to the given sequence
+ * number.
  */
-void rds_rdma_send_complete(struct rds_message *rm, int status)
+int rds_send_acked_before(struct rds_connection *conn, u64 seq)
 {
-       struct rds_sock *rs = NULL;
-       struct rm_rdma_op *ro;
-       struct rds_notifier *notifier;
-       unsigned long flags;
+       struct rds_message *rm, *tmp;
+       int ret = 1;
 
-       spin_lock_irqsave(&rm->m_rs_lock, flags);
+       spin_lock(&conn->c_lock);
 
-       ro = &rm->rdma;
-       if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
-           ro->op_active && ro->op_notify && ro->op_notifier) {
-               notifier = ro->op_notifier;
-               rs = rm->m_rs;
-               sock_hold(rds_rs_to_sk(rs));
-
-               notifier->n_status = status;
-               spin_lock(&rs->rs_lock);
-               list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
-               spin_unlock(&rs->rs_lock);
+       list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+               if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
+                       ret = 0;
+               break;
+       }
 
-               ro->op_notifier = NULL;
+       list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
+               if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
+                       ret = 0;
+               break;
        }
 
-       spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+       spin_unlock(&conn->c_lock);
 
-       if (rs) {
-               rds_wake_sk_sleep(rs);
-               sock_put(rds_rs_to_sk(rs));
-       }
+       return ret;
 }
-EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
 
 /*
- * Just like above, except looks at atomic op
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
  */
-void rds_atomic_send_complete(struct rds_message *rm, int status)
+void rds_rdma_send_complete(struct rds_message *rm, int status)
 {
        struct rds_sock *rs = NULL;
-       struct rm_atomic_op *ao;
+       struct rds_rdma_op *ro;
        struct rds_notifier *notifier;
-       unsigned long flags;
 
-       spin_lock_irqsave(&rm->m_rs_lock, flags);
+       spin_lock(&rm->m_rs_lock);
 
-       ao = &rm->atomic;
+       ro = rm->m_rdma_op;
        if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
-           && ao->op_active && ao->op_notify && ao->op_notifier) {
-               notifier = ao->op_notifier;
+        && ro && ro->r_notify && ro->r_notifier) {
+               notifier = ro->r_notifier;
                rs = rm->m_rs;
                sock_hold(rds_rs_to_sk(rs));
 
@@ -505,17 +429,16 @@ void rds_atomic_send_complete(struct rds_message *rm, int status)
                list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
                spin_unlock(&rs->rs_lock);
 
-               ao->op_notifier = NULL;
+               ro->r_notifier = NULL;
        }
 
-       spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+       spin_unlock(&rm->m_rs_lock);
 
        if (rs) {
                rds_wake_sk_sleep(rs);
                sock_put(rds_rs_to_sk(rs));
        }
 }
-EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
 
 /*
  * This is the same as rds_rdma_send_complete except we
@@ -523,23 +446,15 @@ EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
  * socket, socket lock) and can just move the notifier.
  */
 static inline void
-__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
 {
-       struct rm_rdma_op *ro;
-       struct rm_atomic_op *ao;
-
-       ro = &rm->rdma;
-       if (ro->op_active && ro->op_notify && ro->op_notifier) {
-               ro->op_notifier->n_status = status;
-               list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
-               ro->op_notifier = NULL;
-       }
+       struct rds_rdma_op *ro;
 
-       ao = &rm->atomic;
-       if (ao->op_active && ao->op_notify && ao->op_notifier) {
-               ao->op_notifier->n_status = status;
-               list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
-               ao->op_notifier = NULL;
+       ro = rm->m_rdma_op;
+       if (ro && ro->r_notify && ro->r_notifier) {
+               ro->r_notifier->n_status = status;
+               list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
+               ro->r_notifier = NULL;
        }
 
        /* No need to wake the app - caller does this */
@@ -551,7 +466,7 @@ __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
  * So speed is not an issue here.
  */
 struct rds_message *rds_send_get_message(struct rds_connection *conn,
-                                        struct rm_rdma_op *op)
+                                        struct rds_rdma_op *op)
 {
        struct rds_message *rm, *tmp, *found = NULL;
        unsigned long flags;
@@ -559,7 +474,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
        spin_lock_irqsave(&conn->c_lock, flags);
 
        list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
-               if (&rm->rdma == op) {
+               if (rm->m_rdma_op == op) {
                        atomic_inc(&rm->m_refcount);
                        found = rm;
                        goto out;
@@ -567,7 +482,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
        }
 
        list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
-               if (&rm->rdma == op) {
+               if (rm->m_rdma_op == op) {
                        atomic_inc(&rm->m_refcount);
                        found = rm;
                        break;
@@ -579,7 +494,6 @@ out:
 
        return found;
 }
-EXPORT_SYMBOL_GPL(rds_send_get_message);
 
 /*
  * This removes messages from the socket's list if they're on it.  The list
@@ -589,15 +503,14 @@ EXPORT_SYMBOL_GPL(rds_send_get_message);
 * removing the messages from the 'messages' list regardless of whether it found
  * the messages on the socket list or not.
  */
-static void rds_send_remove_from_sock(struct list_head *messages, int status)
+void rds_send_remove_from_sock(struct list_head *messages, int status)
 {
-       unsigned long flags;
+       unsigned long flags = 0; /* silence gcc :P */
        struct rds_sock *rs = NULL;
        struct rds_message *rm;
 
+       local_irq_save(flags);
        while (!list_empty(messages)) {
-               int was_on_sock = 0;
-
                rm = list_entry(messages->next, struct rds_message,
                                m_conn_item);
                list_del_init(&rm->m_conn_item);
@@ -612,55 +525,52 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status)
                 * while we're messing with it. It does not prevent the
                 * message from being removed from the socket, though.
                 */
-               spin_lock_irqsave(&rm->m_rs_lock, flags);
+               spin_lock(&rm->m_rs_lock);
                if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
                        goto unlock_and_drop;
 
                if (rs != rm->m_rs) {
                        if (rs) {
+                               spin_unlock(&rs->rs_lock);
                                rds_wake_sk_sleep(rs);
                                sock_put(rds_rs_to_sk(rs));
                        }
                        rs = rm->m_rs;
-                       if (rs)
-                               sock_hold(rds_rs_to_sk(rs));
+                       spin_lock(&rs->rs_lock);
+                       sock_hold(rds_rs_to_sk(rs));
                }
-               if (!rs)
-                       goto unlock_and_drop;
-               spin_lock(&rs->rs_lock);
 
                if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
-                       struct rm_rdma_op *ro = &rm->rdma;
+                       struct rds_rdma_op *ro = rm->m_rdma_op;
                        struct rds_notifier *notifier;
 
                        list_del_init(&rm->m_sock_item);
                        rds_send_sndbuf_remove(rs, rm);
 
-                       if (ro->op_active && ro->op_notifier &&
-                              (ro->op_notify || (ro->op_recverr && status))) {
-                               notifier = ro->op_notifier;
+                       if (ro && ro->r_notifier
+                          && (status || ro->r_notify)) {
+                               notifier = ro->r_notifier;
                                list_add_tail(&notifier->n_list,
                                                &rs->rs_notify_queue);
                                if (!notifier->n_status)
                                        notifier->n_status = status;
-                               rm->rdma.op_notifier = NULL;
+                               rm->m_rdma_op->r_notifier = NULL;
                        }
-                       was_on_sock = 1;
+                       rds_message_put(rm);
                        rm->m_rs = NULL;
                }
-               spin_unlock(&rs->rs_lock);
 
 unlock_and_drop:
-               spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+               spin_unlock(&rm->m_rs_lock);
                rds_message_put(rm);
-               if (was_on_sock)
-                       rds_message_put(rm);
        }
 
        if (rs) {
+               spin_unlock(&rs->rs_lock);
                rds_wake_sk_sleep(rs);
                sock_put(rds_rs_to_sk(rs));
        }
+       local_irq_restore(flags);
 }
 
 /*
@@ -670,6 +580,9 @@ unlock_and_drop:
  * queue. This means that in the TCP case, the message may not have been
  * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
  * checks the RDS_MSG_HAS_ACK_SEQ bit.
+ *
+ * XXX It's not clear to me how this is safely serialized with socket
+ * destruction.  Maybe it should bail if it sees SOCK_DEAD.
  */
 void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
                         is_acked_func is_acked)
@@ -690,21 +603,21 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
 
        /* order flag updates with spin locks */
        if (!list_empty(&list))
-               smp_mb__after_atomic();
+               smp_mb__after_clear_bit();
 
        spin_unlock_irqrestore(&conn->c_lock, flags);
 
        /* now remove the messages from the sock list as needed */
        rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
 }
-EXPORT_SYMBOL_GPL(rds_send_drop_acked);
 
 void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 {
        struct rds_message *rm, *tmp;
        struct rds_connection *conn;
-       unsigned long flags;
+       unsigned long flags, flags2;
        LIST_HEAD(list);
+       int wake = 0;
 
        /* get all the messages we're dropping under the rs lock */
        spin_lock_irqsave(&rs->rs_lock, flags);
@@ -714,57 +627,58 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
                             dest->sin_port != rm->m_inc.i_hdr.h_dport))
                        continue;
 
+               wake = 1;
                list_move(&rm->m_sock_item, &list);
                rds_send_sndbuf_remove(rs, rm);
                clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+
+               /* If this is a RDMA operation, notify the app. */
+               __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
        }
 
        /* order flag updates with the rs lock */
-       smp_mb__after_atomic();
+       if (wake)
+               smp_mb__after_clear_bit();
 
        spin_unlock_irqrestore(&rs->rs_lock, flags);
 
-       if (list_empty(&list))
-               return;
+       if (wake)
+               rds_wake_sk_sleep(rs);
+
+       conn = NULL;
 
-       /* Remove the messages from the conn */
+       /* now remove the messages from the conn list as needed */
        list_for_each_entry(rm, &list, m_sock_item) {
+               /* We do this here rather than in the loop above, so that
+                * we don't have to nest m_rs_lock under rs->rs_lock */
+               spin_lock_irqsave(&rm->m_rs_lock, flags2);
+               rm->m_rs = NULL;
+               spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
 
-               conn = rm->m_inc.i_conn;
-
-               spin_lock_irqsave(&conn->c_lock, flags);
                /*
-                * Maybe someone else beat us to removing rm from the conn.
-                * If we race with their flag update we'll get the lock and
-                * then really see that the flag has been cleared.
+                * If we see this flag cleared then we're *sure* that someone
+                * else beat us to removing it from the conn.  If we race
+                * with their flag update we'll get the lock and then really
+                * see that the flag has been cleared.
                 */
-               if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
-                       spin_unlock_irqrestore(&conn->c_lock, flags);
-                       spin_lock_irqsave(&rm->m_rs_lock, flags);
-                       rm->m_rs = NULL;
-                       spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+               if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
                        continue;
-               }
-               list_del_init(&rm->m_conn_item);
-               spin_unlock_irqrestore(&conn->c_lock, flags);
 
-               /*
-                * Couldn't grab m_rs_lock in top loop (lock ordering),
-                * but we can now.
-                */
-               spin_lock_irqsave(&rm->m_rs_lock, flags);
-
-               spin_lock(&rs->rs_lock);
-               __rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
-               spin_unlock(&rs->rs_lock);
-
-               rm->m_rs = NULL;
-               spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+               if (conn != rm->m_inc.i_conn) {
+                       if (conn)
+                               spin_unlock_irqrestore(&conn->c_lock, flags);
+                       conn = rm->m_inc.i_conn;
+                       spin_lock_irqsave(&conn->c_lock, flags);
+               }
 
-               rds_message_put(rm);
+               if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
+                       list_del_init(&rm->m_conn_item);
+                       rds_message_put(rm);
+               }
        }
 
-       rds_wake_sk_sleep(rs);
+       if (conn)
+               spin_unlock_irqrestore(&conn->c_lock, flags);
 
        while (!list_empty(&list)) {
                rm = list_entry(list.next, struct rds_message, m_sock_item);
@@ -844,70 +758,13 @@ out:
        return *queued;
 }
 
-/*
- * rds_message is getting to be quite complicated, and we'd like to allocate
- * it all in one go. This figures out how big it needs to be up front.
- */
-static int rds_rm_size(struct msghdr *msg, int data_len)
-{
-       struct cmsghdr *cmsg;
-       int size = 0;
-       int cmsg_groups = 0;
-       int retval;
-
-       for_each_cmsghdr(cmsg, msg) {
-               if (!CMSG_OK(msg, cmsg))
-                       return -EINVAL;
-
-               if (cmsg->cmsg_level != SOL_RDS)
-                       continue;
-
-               switch (cmsg->cmsg_type) {
-               case RDS_CMSG_RDMA_ARGS:
-                       cmsg_groups |= 1;
-                       retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
-                       if (retval < 0)
-                               return retval;
-                       size += retval;
-
-                       break;
-
-               case RDS_CMSG_RDMA_DEST:
-               case RDS_CMSG_RDMA_MAP:
-                       cmsg_groups |= 2;
-                       /* these are valid but do not add any size */
-                       break;
-
-               case RDS_CMSG_ATOMIC_CSWP:
-               case RDS_CMSG_ATOMIC_FADD:
-               case RDS_CMSG_MASKED_ATOMIC_CSWP:
-               case RDS_CMSG_MASKED_ATOMIC_FADD:
-                       cmsg_groups |= 1;
-                       size += sizeof(struct scatterlist);
-                       break;
-
-               default:
-                       return -EINVAL;
-               }
-
-       }
-
-       size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
-
-       /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
-       if (cmsg_groups == 3)
-               return -EINVAL;
-
-       return size;
-}
-
 static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
                         struct msghdr *msg, int *allocated_mr)
 {
        struct cmsghdr *cmsg;
        int ret = 0;
 
-       for_each_cmsghdr(cmsg, msg) {
+       for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
                if (!CMSG_OK(msg, cmsg))
                        return -EINVAL;
 
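
The open-coded control-message walk restored above is exactly what the later for_each_cmsghdr() helper expands to. A self-contained sketch of the walk, with the per-type dispatch left as a stub:

#include <linux/socket.h>
#include <linux/errno.h>

static int toy_walk_cmsgs(struct msghdr *msg)
{
	struct cmsghdr *cmsg;

	/* for_each_cmsghdr(cmsg, msg) is defined as exactly this loop. */
	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_RDS)
			continue;
		/* dispatch on cmsg->cmsg_type here, as rds_cmsg_send() does */
	}
	return 0;
}
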
@@ -915,7 +772,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
                        continue;
 
                /* As a side effect, RDMA_DEST and RDMA_MAP will set
-                * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
+                * rm->m_rdma_cookie and rm->m_rdma_mr.
                 */
                switch (cmsg->cmsg_type) {
                case RDS_CMSG_RDMA_ARGS:
@@ -931,12 +788,6 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
                        if (!ret)
                                *allocated_mr = 1;
                        break;
-               case RDS_CMSG_ATOMIC_CSWP:
-               case RDS_CMSG_ATOMIC_FADD:
-               case RDS_CMSG_MASKED_ATOMIC_CSWP:
-               case RDS_CMSG_MASKED_ATOMIC_FADD:
-                       ret = rds_cmsg_atomic(rs, rm, cmsg);
-                       break;
 
                default:
                        return -EINVAL;
@@ -949,11 +800,12 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
        return ret;
 }
 
-int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+               size_t payload_len)
 {
        struct sock *sk = sock->sk;
        struct rds_sock *rs = rds_sk_to_rs(sk);
-       DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+       struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
        __be32 daddr;
        __be16 dport;
        struct rds_message *rm = NULL;
@@ -961,11 +813,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
        int ret = 0;
        int queued = 0, allocated_mr = 0;
        int nonblock = msg->msg_flags & MSG_DONTWAIT;
-       long timeo = sock_sndtimeo(sk, nonblock);
+       long timeo = sock_rcvtimeo(sk, nonblock);
 
        /* Mirror Linux UDP's mirroring of BSD error message compatibility */
        /* XXX: Perhaps MSG_MORE someday */
        if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
+               printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
                ret = -EOPNOTSUPP;
                goto out;
        }
@@ -992,32 +845,20 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
                goto out;
        }
 
-       /* size of rm including all sgs */
-       ret = rds_rm_size(msg, payload_len);
-       if (ret < 0)
+       rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
+       if (IS_ERR(rm)) {
+               ret = PTR_ERR(rm);
+               rm = NULL;
                goto out;
-
-       rm = rds_message_alloc(ret, GFP_KERNEL);
-       if (!rm) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       /* Attach data to the rm */
-       if (payload_len) {
-               rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
-               if (!rm->data.op_sg) {
-                       ret = -ENOMEM;
-                       goto out;
-               }
-               ret = rds_message_copy_from_user(rm, &msg->msg_iter);
-               if (ret)
-                       goto out;
        }
-       rm->data.op_active = 1;
 
        rm->m_daddr = daddr;
 
+       /* Parse any control messages the user may have included. */
+       ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
+       if (ret)
+               goto out;
+
        /* rds_conn_create has a spinlock that runs with IRQ off.
         * Caching the conn in the socket helps a lot. */
        if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
@@ -1033,32 +874,26 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
                rs->rs_conn = conn;
        }
 
-       /* Parse any control messages the user may have included. */
-       ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
-       if (ret)
-               goto out;
-
-       if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
-               printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
-                              &rm->rdma, conn->c_trans->xmit_rdma);
-               ret = -EOPNOTSUPP;
-               goto out;
-       }
-
-       if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
-               printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
-                              &rm->atomic, conn->c_trans->xmit_atomic);
+       if ((rm->m_rdma_cookie || rm->m_rdma_op)
+        && conn->c_trans->xmit_rdma == NULL) {
+               if (printk_ratelimit())
+                       printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
+                               rm->m_rdma_op, conn->c_trans->xmit_rdma);
                ret = -EOPNOTSUPP;
                goto out;
        }
 
-       rds_conn_connect_if_down(conn);
+       /* If the connection is down, trigger a connect. We may
+        * have scheduled a delayed reconnect however - in this case
+        * we should not interfere.
+        */
+       if (rds_conn_state(conn) == RDS_CONN_DOWN
+        && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+               queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
 
        ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
-       if (ret) {
-               rs->rs_seen_congestion = 1;
+       if (ret)
                goto out;
-       }
 
        while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
                                  dport, &queued)) {
@@ -1073,7 +908,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
                        goto out;
                }
 
-               timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+               timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
                                        rds_send_queue_rm(rs, conn, rm,
                                                          rs->rs_bound_port,
                                                          dport,
@@ -1096,7 +931,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
        rds_stats_inc(s_send_queued);
 
        if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
-               rds_send_xmit(conn);
+               rds_send_worker(&conn->c_send_w.work);
 
        rds_message_put(rm);
        return payload_len;
@@ -1124,15 +959,20 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
        int ret = 0;
 
        rm = rds_message_alloc(0, GFP_ATOMIC);
-       if (!rm) {
+       if (rm == NULL) {
                ret = -ENOMEM;
                goto out;
        }
 
        rm->m_daddr = conn->c_faddr;
-       rm->data.op_active = 1;
 
-       rds_conn_connect_if_down(conn);
+       /* If the connection is down, trigger a connect. We may
+        * have scheduled a delayed reconnect however - in this case
+        * we should not interfere.
+        */
+       if (rds_conn_state(conn) == RDS_CONN_DOWN
+        && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+               queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
 
        ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
        if (ret)
@@ -1152,9 +992,7 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
        rds_stats_inc(s_send_queued);
        rds_stats_inc(s_send_pong);
 
-       if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
-               queue_delayed_work(rds_wq, &conn->c_send_w, 0);
-
+       queue_delayed_work(rds_wq, &conn->c_send_w, 0);
        rds_message_put(rm);
        return 0;
 
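
Note how both rds_sendmsg() and rds_send_pong() finish by kicking c_send_w rather than looping themselves; together with the send_quota counter at the top of rds_send_xmit() this caps the work done per pass and hands the remainder back to the workqueue. A sketch of that quota-and-reschedule shape, extending the hypothetical toy_conn from the earlier sketch with a delayed work item:

#include <linux/workqueue.h>

struct toy_conn {
	struct delayed_work	send_w;	/* plus the fields sketched earlier */
};

static struct workqueue_struct *toy_wq;

static int toy_xmit_one(struct toy_conn *conn)
{
	/* push one message down the transport; > 0 means progress */
	return 0;	/* stub: nothing to send */
}

static void toy_send_worker(struct work_struct *work)
{
	struct toy_conn *conn = container_of(work, struct toy_conn,
					     send_w.work);
	int quota = 64;	/* plays the role of send_batch_count */

	while (--quota && toy_xmit_one(conn) > 0)
		;

	/* Quota spent with work possibly left: yield the CPU, but
	 * reschedule so the remaining messages still go out. */
	if (quota == 0)
		queue_delayed_work(toy_wq, &conn->send_w, 1);
}
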
diff --git a/net/rds/stats.c b/net/rds/stats.c
index 73be187d389ed044c886d22e4960905283d9de87..637146893cf3e24286404e43c1400492c5437cb3 100644 (file)
 #include <linux/percpu.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
-#include <linux/export.h>
 
 #include "rds.h"
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
-EXPORT_PER_CPU_SYMBOL_GPL(rds_stats);
 
 /* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
 
-static const char *const rds_stat_names[] = {
+static char *rds_stat_names[] = {
        "conn_reset",
        "recv_drop_bad_checksum",
        "recv_drop_old_seq",
@@ -58,8 +56,8 @@ static const char *const rds_stat_names[] = {
        "recv_ping",
        "send_queue_empty",
        "send_queue_full",
-       "send_lock_contention",
-       "send_lock_queue_raced",
+       "send_sem_contention",
+       "send_sem_queue_raced",
        "send_immediate_retry",
        "send_delayed_retry",
        "send_drop_acked",
@@ -79,7 +77,7 @@ static const char *const rds_stat_names[] = {
 };
 
 void rds_stats_info_copy(struct rds_info_iterator *iter,
-                        uint64_t *values, const char *const *names, size_t nr)
+                        uint64_t *values, char **names, size_t nr)
 {
        struct rds_info_counter ctr;
        size_t i;
@@ -87,13 +85,11 @@ void rds_stats_info_copy(struct rds_info_iterator *iter,
        for (i = 0; i < nr; i++) {
                BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
                strncpy(ctr.name, names[i], sizeof(ctr.name) - 1);
-               ctr.name[sizeof(ctr.name) - 1] = '\0';
                ctr.value = values[i];
 
                rds_info_copy(iter, &ctr, sizeof(ctr));
        }
 }
-EXPORT_SYMBOL_GPL(rds_stats_info_copy);
 
 /*
  * This gives global counters across all the transports.  The strings
@@ -145,7 +141,7 @@ void rds_stats_exit(void)
        rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
 }
 
-int rds_stats_init(void)
+int __init rds_stats_init(void)
 {
        rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
        return 0;
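
The counters above follow the usual per-CPU scheme: each CPU bumps its own rds_stats copy with no locking, and the info path sums the copies when userspace asks. A minimal sketch with a single hypothetical counter:

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/types.h>

struct toy_stats {
	uint64_t	s_send_queued;
};

static DEFINE_PER_CPU(struct toy_stats, toy_stats);

/* Writers touch only their own CPU's copy, e.g. toy_stats_inc(s_send_queued); */
#define toy_stats_inc(member)	this_cpu_inc(toy_stats.member)

/* The export path sums across CPUs at read time, as rds_stats_info() does. */
static uint64_t toy_stats_read_send_queued(void)
{
	uint64_t sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += per_cpu(toy_stats, cpu).s_send_queued;
	return sum;
}
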
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
index c173f69e1479bfaf643b9e5c69f4c9a151b18c67..307dc5c1be153d3326dbd0fb62e4874241fc559a 100644 (file)
@@ -49,61 +49,74 @@ unsigned int  rds_sysctl_max_unacked_bytes = (16 << 20);
 
 unsigned int rds_sysctl_ping_enable = 1;
 
-static struct ctl_table rds_sysctl_rds_table[] = {
+static ctl_table rds_sysctl_rds_table[] = {
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "reconnect_min_delay_ms",
                .data           = &rds_sysctl_reconnect_min_jiffies,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-               .proc_handler   = proc_doulongvec_ms_jiffies_minmax,
+               .proc_handler   = &proc_doulongvec_ms_jiffies_minmax,
                .extra1         = &rds_sysctl_reconnect_min,
                .extra2         = &rds_sysctl_reconnect_max_jiffies,
        },
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "reconnect_max_delay_ms",
                .data           = &rds_sysctl_reconnect_max_jiffies,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-               .proc_handler   = proc_doulongvec_ms_jiffies_minmax,
+               .proc_handler   = &proc_doulongvec_ms_jiffies_minmax,
                .extra1         = &rds_sysctl_reconnect_min_jiffies,
                .extra2         = &rds_sysctl_reconnect_max,
        },
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "max_unacked_packets",
                .data           = &rds_sysctl_max_unacked_packets,
-               .maxlen         = sizeof(int),
+               .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = &proc_dointvec,
        },
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "max_unacked_bytes",
                .data           = &rds_sysctl_max_unacked_bytes,
-               .maxlen         = sizeof(int),
+               .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = &proc_dointvec,
        },
        {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "ping_enable",
                .data           = &rds_sysctl_ping_enable,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = &proc_dointvec,
        },
+       { .ctl_name = 0}
+};
+
+static struct ctl_path rds_sysctl_path[] = {
+       { .procname = "net", .ctl_name = CTL_NET, },
+       { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
        { }
 };
 
+
 void rds_sysctl_exit(void)
 {
-       unregister_net_sysctl_table(rds_sysctl_reg_table);
+       if (rds_sysctl_reg_table)
+               unregister_sysctl_table(rds_sysctl_reg_table);
 }
 
-int rds_sysctl_init(void)
+int __init rds_sysctl_init(void)
 {
        rds_sysctl_reconnect_min = msecs_to_jiffies(1);
        rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
 
-       rds_sysctl_reg_table = register_net_sysctl(&init_net,"net/rds", rds_sysctl_rds_table);
-       if (!rds_sysctl_reg_table)
+       rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
+       if (rds_sysctl_reg_table == NULL)
                return -ENOMEM;
        return 0;
 }
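
This half of the revert swaps register_net_sysctl() for the older ctl_path/register_sysctl_paths() API, where every entry carries a .ctl_name and the "net/rds" path is spelled out element by element. A minimal sketch of a table registered the old way, using that pre-register_net_sysctl() API and hypothetical toy_* names:

#include <linux/sysctl.h>
#include <linux/errno.h>
#include <linux/init.h>

static int toy_flag = 1;

static ctl_table toy_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "toy_flag",
		.data		= &toy_flag,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static struct ctl_path toy_path[] = {
	{ .procname = "net", .ctl_name = CTL_NET, },
	{ .procname = "toy", .ctl_name = CTL_UNNUMBERED, },
	{ }
};

static struct ctl_table_header *toy_hdr;

static int __init toy_sysctl_init(void)
{
	toy_hdr = register_sysctl_paths(toy_path, toy_table);
	return toy_hdr ? 0 : -ENOMEM;
}
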
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
deleted file mode 100644 (file)
index edac9ef..0000000
+++ /dev/null
@@ -1,326 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/in.h>
-#include <linux/module.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-/* only for info exporting */
-static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
-static LIST_HEAD(rds_tcp_tc_list);
-static unsigned int rds_tcp_tc_count;
-
-/* Track rds_tcp_connection structs so they can be cleaned up */
-static DEFINE_SPINLOCK(rds_tcp_conn_lock);
-static LIST_HEAD(rds_tcp_conn_list);
-
-static struct kmem_cache *rds_tcp_conn_slab;
-
-#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
-
-/* doing it this way avoids calling tcp_sk() */
-void rds_tcp_nonagle(struct socket *sock)
-{
-       mm_segment_t oldfs = get_fs();
-       int val = 1;
-
-       set_fs(KERNEL_DS);
-       sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val,
-                             sizeof(val));
-       set_fs(oldfs);
-}
-
-void rds_tcp_tune(struct socket *sock)
-{
-       struct sock *sk = sock->sk;
-
-       rds_tcp_nonagle(sock);
-
-       /*
-        * We're trying to saturate gigabit with the default,
-        * see svc_sock_setbufsize().
-        */
-       lock_sock(sk);
-       sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE;
-       sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE;
-       sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
-       release_sock(sk);
-}
-
-u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
-{
-       return tcp_sk(tc->t_sock->sk)->snd_nxt;
-}
-
-u32 rds_tcp_snd_una(struct rds_tcp_connection *tc)
-{
-       return tcp_sk(tc->t_sock->sk)->snd_una;
-}
-
-void rds_tcp_restore_callbacks(struct socket *sock,
-                              struct rds_tcp_connection *tc)
-{
-       rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc);
-       write_lock_bh(&sock->sk->sk_callback_lock);
-
-       /* done under the callback_lock to serialize with write_space */
-       spin_lock(&rds_tcp_tc_list_lock);
-       list_del_init(&tc->t_list_item);
-       rds_tcp_tc_count--;
-       spin_unlock(&rds_tcp_tc_list_lock);
-
-       tc->t_sock = NULL;
-
-       sock->sk->sk_write_space = tc->t_orig_write_space;
-       sock->sk->sk_data_ready = tc->t_orig_data_ready;
-       sock->sk->sk_state_change = tc->t_orig_state_change;
-       sock->sk->sk_user_data = NULL;
-
-       write_unlock_bh(&sock->sk->sk_callback_lock);
-}
-
-/*
- * This is the only path that sets tc->t_sock.  Send and receive trust that
- * it is set.  The RDS_CONN_CONNECTED bit protects those paths from being
- * called while it isn't set.
- */
-void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
-{
-       struct rds_tcp_connection *tc = conn->c_transport_data;
-
-       rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
-       write_lock_bh(&sock->sk->sk_callback_lock);
-
-       /* done under the callback_lock to serialize with write_space */
-       spin_lock(&rds_tcp_tc_list_lock);
-       list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
-       rds_tcp_tc_count++;
-       spin_unlock(&rds_tcp_tc_list_lock);
-
-       /* accepted sockets need our listen data ready undone */
-       if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
-               sock->sk->sk_data_ready = sock->sk->sk_user_data;
-
-       tc->t_sock = sock;
-       tc->conn = conn;
-       tc->t_orig_data_ready = sock->sk->sk_data_ready;
-       tc->t_orig_write_space = sock->sk->sk_write_space;
-       tc->t_orig_state_change = sock->sk->sk_state_change;
-
-       sock->sk->sk_user_data = conn;
-       sock->sk->sk_data_ready = rds_tcp_data_ready;
-       sock->sk->sk_write_space = rds_tcp_write_space;
-       sock->sk->sk_state_change = rds_tcp_state_change;
-
-       write_unlock_bh(&sock->sk->sk_callback_lock);
-}
-
-static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
-                           struct rds_info_iterator *iter,
-                           struct rds_info_lengths *lens)
-{
-       struct rds_info_tcp_socket tsinfo;
-       struct rds_tcp_connection *tc;
-       unsigned long flags;
-       struct sockaddr_in sin;
-       int sinlen;
-
-       spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
-
-       if (len / sizeof(tsinfo) < rds_tcp_tc_count)
-               goto out;
-
-       list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
-
-               sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0);
-               tsinfo.local_addr = sin.sin_addr.s_addr;
-               tsinfo.local_port = sin.sin_port;
-               sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1);
-               tsinfo.peer_addr = sin.sin_addr.s_addr;
-               tsinfo.peer_port = sin.sin_port;
-
-               tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
-               tsinfo.data_rem = tc->t_tinc_data_rem;
-               tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
-               tsinfo.last_expected_una = tc->t_last_expected_una;
-               tsinfo.last_seen_una = tc->t_last_seen_una;
-
-               rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
-       }
-
-out:
-       lens->nr = rds_tcp_tc_count;
-       lens->each = sizeof(tsinfo);
-
-       spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
-}
-
-static int rds_tcp_laddr_check(__be32 addr)
-{
-       if (inet_addr_type(&init_net, addr) == RTN_LOCAL)
-               return 0;
-       return -EADDRNOTAVAIL;
-}
-
-static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
-{
-       struct rds_tcp_connection *tc;
-
-       tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
-       if (!tc)
-               return -ENOMEM;
-
-       tc->t_sock = NULL;
-       tc->t_tinc = NULL;
-       tc->t_tinc_hdr_rem = sizeof(struct rds_header);
-       tc->t_tinc_data_rem = 0;
-
-       conn->c_transport_data = tc;
-
-       spin_lock_irq(&rds_tcp_conn_lock);
-       list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
-       spin_unlock_irq(&rds_tcp_conn_lock);
-
-       rdsdebug("alloced tc %p\n", conn->c_transport_data);
-       return 0;
-}
-
-static void rds_tcp_conn_free(void *arg)
-{
-       struct rds_tcp_connection *tc = arg;
-       unsigned long flags;
-       rdsdebug("freeing tc %p\n", tc);
-
-       spin_lock_irqsave(&rds_tcp_conn_lock, flags);
-       list_del(&tc->t_tcp_node);
-       spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
-
-       kmem_cache_free(rds_tcp_conn_slab, tc);
-}
-
-static void rds_tcp_destroy_conns(void)
-{
-       struct rds_tcp_connection *tc, *_tc;
-       LIST_HEAD(tmp_list);
-
-       /* avoid calling conn_destroy with irqs off */
-       spin_lock_irq(&rds_tcp_conn_lock);
-       list_splice(&rds_tcp_conn_list, &tmp_list);
-       INIT_LIST_HEAD(&rds_tcp_conn_list);
-       spin_unlock_irq(&rds_tcp_conn_lock);
-
-       list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
-               if (tc->conn->c_passive)
-                       rds_conn_destroy(tc->conn->c_passive);
-               rds_conn_destroy(tc->conn);
-       }
-}
-
-static void rds_tcp_exit(void)
-{
-       rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
-       rds_tcp_listen_stop();
-       rds_tcp_destroy_conns();
-       rds_trans_unregister(&rds_tcp_transport);
-       rds_tcp_recv_exit();
-       kmem_cache_destroy(rds_tcp_conn_slab);
-}
-module_exit(rds_tcp_exit);
-
-struct rds_transport rds_tcp_transport = {
-       .laddr_check            = rds_tcp_laddr_check,
-       .xmit_prepare           = rds_tcp_xmit_prepare,
-       .xmit_complete          = rds_tcp_xmit_complete,
-       .xmit                   = rds_tcp_xmit,
-       .recv                   = rds_tcp_recv,
-       .conn_alloc             = rds_tcp_conn_alloc,
-       .conn_free              = rds_tcp_conn_free,
-       .conn_connect           = rds_tcp_conn_connect,
-       .conn_shutdown          = rds_tcp_conn_shutdown,
-       .inc_copy_to_user       = rds_tcp_inc_copy_to_user,
-       .inc_free               = rds_tcp_inc_free,
-       .stats_info_copy        = rds_tcp_stats_info_copy,
-       .exit                   = rds_tcp_exit,
-       .t_owner                = THIS_MODULE,
-       .t_name                 = "tcp",
-       .t_type                 = RDS_TRANS_TCP,
-       .t_prefer_loopback      = 1,
-};
-
-static int rds_tcp_init(void)
-{
-       int ret;
-
-       rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
-                                             sizeof(struct rds_tcp_connection),
-                                             0, 0, NULL);
-       if (!rds_tcp_conn_slab) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       ret = rds_tcp_recv_init();
-       if (ret)
-               goto out_slab;
-
-       ret = rds_trans_register(&rds_tcp_transport);
-       if (ret)
-               goto out_recv;
-
-       ret = rds_tcp_listen_init();
-       if (ret)
-               goto out_register;
-
-       rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
-
-       goto out;
-
-out_register:
-       rds_trans_unregister(&rds_tcp_transport);
-out_recv:
-       rds_tcp_recv_exit();
-out_slab:
-       kmem_cache_destroy(rds_tcp_conn_slab);
-out:
-       return ret;
-}
-module_init(rds_tcp_init);
-
-MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
-MODULE_DESCRIPTION("RDS: TCP transport");
-MODULE_LICENSE("Dual BSD/GPL");
-
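
The callback handling in the deleted tcp.c revolves around one idea: under sk_callback_lock, stash the socket's original hooks, point them at the transport, and put them back on teardown. A stripped-down sketch of just that swap, with hypothetical toy_* names and only the data_ready hook:

#include <net/sock.h>

struct toy_tc {
	void (*orig_data_ready)(struct sock *sk);
};

static void toy_data_ready(struct sock *sk)
{
	/* the transport's replacement hook; body omitted */
}

static void toy_set_callbacks(struct socket *sock, struct toy_tc *tc)
{
	write_lock_bh(&sock->sk->sk_callback_lock);
	tc->orig_data_ready = sock->sk->sk_data_ready;
	sock->sk->sk_user_data = tc;
	sock->sk->sk_data_ready = toy_data_ready;
	write_unlock_bh(&sock->sk->sk_callback_lock);
}

static void toy_restore_callbacks(struct socket *sock, struct toy_tc *tc)
{
	write_lock_bh(&sock->sk->sk_callback_lock);
	sock->sk->sk_user_data = NULL;
	sock->sk->sk_data_ready = tc->orig_data_ready;
	write_unlock_bh(&sock->sk->sk_callback_lock);
}
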
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
deleted file mode 100644 (file)
index 0dbdd37..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-#ifndef _RDS_TCP_H
-#define _RDS_TCP_H
-
-#define RDS_TCP_PORT   16385
-
-struct rds_tcp_incoming {
-       struct rds_incoming     ti_inc;
-       struct sk_buff_head     ti_skb_list;
-};
-
-struct rds_tcp_connection {
-
-       struct list_head        t_tcp_node;
-       struct rds_connection   *conn;
-       struct socket           *t_sock;
-       void                    *t_orig_write_space;
-       void                    *t_orig_data_ready;
-       void                    *t_orig_state_change;
-
-       struct rds_tcp_incoming *t_tinc;
-       size_t                  t_tinc_hdr_rem;
-       size_t                  t_tinc_data_rem;
-
-       /* XXX error report? */
-       struct work_struct      t_conn_w;
-       struct work_struct      t_send_w;
-       struct work_struct      t_down_w;
-       struct work_struct      t_recv_w;
-
-       /* for info exporting only */
-       struct list_head        t_list_item;
-       u32                     t_last_sent_nxt;
-       u32                     t_last_expected_una;
-       u32                     t_last_seen_una;
-};
-
-struct rds_tcp_statistics {
-       uint64_t        s_tcp_data_ready_calls;
-       uint64_t        s_tcp_write_space_calls;
-       uint64_t        s_tcp_sndbuf_full;
-       uint64_t        s_tcp_connect_raced;
-       uint64_t        s_tcp_listen_closed_stale;
-};
-
-/* tcp.c */
-void rds_tcp_tune(struct socket *sock);
-void rds_tcp_nonagle(struct socket *sock);
-void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
-void rds_tcp_restore_callbacks(struct socket *sock,
-                              struct rds_tcp_connection *tc);
-u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
-u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
-u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq);
-extern struct rds_transport rds_tcp_transport;
-
-/* tcp_connect.c */
-int rds_tcp_conn_connect(struct rds_connection *conn);
-void rds_tcp_conn_shutdown(struct rds_connection *conn);
-void rds_tcp_state_change(struct sock *sk);
-
-/* tcp_listen.c */
-int rds_tcp_listen_init(void);
-void rds_tcp_listen_stop(void);
-void rds_tcp_listen_data_ready(struct sock *sk);
-
-/* tcp_recv.c */
-int rds_tcp_recv_init(void);
-void rds_tcp_recv_exit(void);
-void rds_tcp_data_ready(struct sock *sk);
-int rds_tcp_recv(struct rds_connection *conn);
-void rds_tcp_inc_free(struct rds_incoming *inc);
-int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
-
-/* tcp_send.c */
-void rds_tcp_xmit_prepare(struct rds_connection *conn);
-void rds_tcp_xmit_complete(struct rds_connection *conn);
-int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
-                unsigned int hdr_off, unsigned int sg, unsigned int off);
-void rds_tcp_write_space(struct sock *sk);
-
-/* tcp_stats.c */
-DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
-#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member)
-unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
-                                    unsigned int avail);
-
-#endif
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
deleted file mode 100644 (file)
index 973109c..0000000
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-void rds_tcp_state_change(struct sock *sk)
-{
-       void (*state_change)(struct sock *sk);
-       struct rds_connection *conn;
-       struct rds_tcp_connection *tc;
-
-       read_lock(&sk->sk_callback_lock);
-       conn = sk->sk_user_data;
-       if (!conn) {
-               state_change = sk->sk_state_change;
-               goto out;
-       }
-       tc = conn->c_transport_data;
-       state_change = tc->t_orig_state_change;
-
-       rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
-
-       switch(sk->sk_state) {
-               /* ignore connecting sockets as they make progress */
-               case TCP_SYN_SENT:
-               case TCP_SYN_RECV:
-                       break;
-               case TCP_ESTABLISHED:
-                       rds_connect_complete(conn);
-                       break;
-               case TCP_CLOSE_WAIT:
-               case TCP_CLOSE:
-                       rds_conn_drop(conn);
-               default:
-                       break;
-       }
-out:
-       read_unlock(&sk->sk_callback_lock);
-       state_change(sk);
-}
-
-int rds_tcp_conn_connect(struct rds_connection *conn)
-{
-       struct socket *sock = NULL;
-       struct sockaddr_in src, dest;
-       int ret;
-
-       ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
-       if (ret < 0)
-               goto out;
-
-       rds_tcp_tune(sock);
-
-       src.sin_family = AF_INET;
-       src.sin_addr.s_addr = (__force u32)conn->c_laddr;
-       src.sin_port = (__force u16)htons(0);
-
-       ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src));
-       if (ret) {
-               rdsdebug("bind failed with %d at address %pI4\n",
-                        ret, &conn->c_laddr);
-               goto out;
-       }
-
-       dest.sin_family = AF_INET;
-       dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
-       dest.sin_port = (__force u16)htons(RDS_TCP_PORT);
-
-       /*
-        * once we call connect() we can start getting callbacks and they
-        * own the socket
-        */
-       rds_tcp_set_callbacks(sock, conn);
-       ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
-                                O_NONBLOCK);
-
-       rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
-       if (ret == -EINPROGRESS)
-               ret = 0;
-       if (ret == 0)
-               sock = NULL;
-       else
-               rds_tcp_restore_callbacks(sock, conn->c_transport_data);
-
-out:
-       if (sock)
-               sock_release(sock);
-       return ret;
-}
-
-/*
- * Before killing the tcp socket this needs to serialize with callbacks.  The
- * caller has already grabbed the sending sem so we're serialized with other
- * senders.
- *
- * TCP calls the callbacks with the sock lock so we hold it while we reset the
- * callbacks to those set by TCP.  Our callbacks won't execute again once we
- * hold the sock lock.
- */
-void rds_tcp_conn_shutdown(struct rds_connection *conn)
-{
-       struct rds_tcp_connection *tc = conn->c_transport_data;
-       struct socket *sock = tc->t_sock;
-
-       rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock);
-
-       if (sock) {
-               sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
-               lock_sock(sock->sk);
-               rds_tcp_restore_callbacks(sock, tc); /* tc->t_sock = NULL */
-
-               release_sock(sock->sk);
-               sock_release(sock);
-       }
-
-       if (tc->t_tinc) {
-               rds_inc_put(&tc->t_tinc->ti_inc);
-               tc->t_tinc = NULL;
-       }
-       tc->t_tinc_hdr_rem = sizeof(struct rds_header);
-       tc->t_tinc_data_rem = 0;
-}
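rds_tcp_conn_connect() above issues the connect with O_NONBLOCK and treats -EINPROGRESS as success, leaving rds_tcp_state_change() to learn the real outcome when TCP later moves to TCP_ESTABLISHED or TCP_CLOSE. The same convention, sketched with the userspace socket API (address and port are placeholders, not part of the diff):

#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Start a nonblocking connect: 0 means "done or in flight", <0 a real
 * error.  Completion is learned later (poll for POLLOUT, then check
 * SO_ERROR), much as the RDS code learns it via the state_change hook. */
static int connect_nonblock(int fd, const char *ip, unsigned short port)
{
        struct sockaddr_in dest;
        int flags;

        memset(&dest, 0, sizeof(dest));
        dest.sin_family = AF_INET;
        dest.sin_port = htons(port);
        if (inet_pton(AF_INET, ip, &dest.sin_addr) != 1)
                return -EINVAL;

        flags = fcntl(fd, F_GETFL, 0);
        if (flags < 0 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0)
                return -errno;

        if (connect(fd, (struct sockaddr *)&dest, sizeof(dest)) == 0)
                return 0;               /* connected immediately */
        if (errno == EINPROGRESS)
                return 0;               /* handshake continues async */
        return -errno;                  /* genuine failure */
}

int main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return 1;
        printf("connect_nonblock: %d\n",
               connect_nonblock(fd, "127.0.0.1", 16385));
        close(fd);
        return 0;
}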
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
deleted file mode 100644 (file)
index 0da49e3..0000000
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/gfp.h>
-#include <linux/in.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-/*
- * cheesy, but simple..
- */
-static void rds_tcp_accept_worker(struct work_struct *work);
-static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
-static struct socket *rds_tcp_listen_sock;
-
-static int rds_tcp_keepalive(struct socket *sock)
-{
-       /* values below based on xs_udp_default_timeout */
-       int keepidle = 5; /* send a probe 'keepidle' secs after last data */
-       int keepcnt = 5; /* number of unack'ed probes before declaring dead */
-       int keepalive = 1;
-       int ret = 0;
-
-       ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
-                               (char *)&keepalive, sizeof(keepalive));
-       if (ret < 0)
-               goto bail;
-
-       ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
-                               (char *)&keepcnt, sizeof(keepcnt));
-       if (ret < 0)
-               goto bail;
-
-       ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
-                               (char *)&keepidle, sizeof(keepidle));
-       if (ret < 0)
-               goto bail;
-
-       /* KEEPINTVL is the interval between successive probes. We follow
-        * the model in xs_tcp_finish_connecting() and re-use keepidle.
-        */
-       ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
-                               (char *)&keepidle, sizeof(keepidle));
-bail:
-       return ret;
-}
-
-static int rds_tcp_accept_one(struct socket *sock)
-{
-       struct socket *new_sock = NULL;
-       struct rds_connection *conn;
-       int ret;
-       struct inet_sock *inet;
-       struct rds_tcp_connection *rs_tcp;
-
-       ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
-                              sock->sk->sk_protocol, &new_sock);
-       if (ret)
-               goto out;
-
-       new_sock->type = sock->type;
-       new_sock->ops = sock->ops;
-       ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
-       if (ret < 0)
-               goto out;
-
-       ret = rds_tcp_keepalive(new_sock);
-       if (ret < 0)
-               goto out;
-
-       rds_tcp_tune(new_sock);
-
-       inet = inet_sk(new_sock->sk);
-
-       rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n",
-                &inet->inet_saddr, ntohs(inet->inet_sport),
-                &inet->inet_daddr, ntohs(inet->inet_dport));
-
-       conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr,
-                              &rds_tcp_transport, GFP_KERNEL);
-       if (IS_ERR(conn)) {
-               ret = PTR_ERR(conn);
-               goto out;
-       }
-       /* An incoming SYN request came in, and TCP just accepted it.
-        * We always create a new conn for listen side of TCP, and do not
-        * add it to the c_hash_list.
-        *
-        * If the client reboots, this conn will need to be cleaned up.
-        * rds_tcp_state_change() will do that cleanup
-        */
-       rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
-       WARN_ON(!rs_tcp || rs_tcp->t_sock);
-
-       /*
-        * see the comment above rds_queue_delayed_reconnect()
-        */
-       if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
-               if (rds_conn_state(conn) == RDS_CONN_UP)
-                       rds_tcp_stats_inc(s_tcp_listen_closed_stale);
-               else
-                       rds_tcp_stats_inc(s_tcp_connect_raced);
-               rds_conn_drop(conn);
-               ret = 0;
-               goto out;
-       }
-
-       rds_tcp_set_callbacks(new_sock, conn);
-       rds_connect_complete(conn);
-       new_sock = NULL;
-       ret = 0;
-
-out:
-       if (new_sock)
-               sock_release(new_sock);
-       return ret;
-}
-
-static void rds_tcp_accept_worker(struct work_struct *work)
-{
-       while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0)
-               cond_resched();
-}
-
-void rds_tcp_listen_data_ready(struct sock *sk)
-{
-       void (*ready)(struct sock *sk);
-
-       rdsdebug("listen data ready sk %p\n", sk);
-
-       read_lock(&sk->sk_callback_lock);
-       ready = sk->sk_user_data;
-       if (!ready) { /* check for teardown race */
-               ready = sk->sk_data_ready;
-               goto out;
-       }
-
-       /*
-        * ->sk_data_ready is also called for a newly established child socket
-        * before it has been accepted and the acceptor has set up its
-        * data_ready; we only want to queue listen work for our listening
-        * socket.
-        */
-       if (sk->sk_state == TCP_LISTEN)
-               queue_work(rds_wq, &rds_tcp_listen_work);
-
-out:
-       read_unlock(&sk->sk_callback_lock);
-       ready(sk);
-}
-
-int rds_tcp_listen_init(void)
-{
-       struct sockaddr_in sin;
-       struct socket *sock = NULL;
-       int ret;
-
-       ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
-       if (ret < 0)
-               goto out;
-
-       sock->sk->sk_reuse = SK_CAN_REUSE;
-       rds_tcp_nonagle(sock);
-
-       write_lock_bh(&sock->sk->sk_callback_lock);
-       sock->sk->sk_user_data = sock->sk->sk_data_ready;
-       sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
-       write_unlock_bh(&sock->sk->sk_callback_lock);
-
-       sin.sin_family = PF_INET;
-       sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
-       sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
-
-       ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
-       if (ret < 0)
-               goto out;
-
-       ret = sock->ops->listen(sock, 64);
-       if (ret < 0)
-               goto out;
-
-       rds_tcp_listen_sock = sock;
-       sock = NULL;
-out:
-       if (sock)
-               sock_release(sock);
-       return ret;
-}
-
-void rds_tcp_listen_stop(void)
-{
-       struct socket *sock = rds_tcp_listen_sock;
-       struct sock *sk;
-
-       if (!sock)
-               return;
-
-       sk = sock->sk;
-
-       /* serialize with and prevent further callbacks */
-       lock_sock(sk);
-       write_lock_bh(&sk->sk_callback_lock);
-       if (sk->sk_user_data) {
-               sk->sk_data_ready = sk->sk_user_data;
-               sk->sk_user_data = NULL;
-       }
-       write_unlock_bh(&sk->sk_callback_lock);
-       release_sock(sk);
-
-       /* wait for accepts to stop and close the socket */
-       flush_workqueue(rds_wq);
-       sock_release(sock);
-       rds_tcp_listen_sock = NULL;
-}
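The pattern running through this file is the callback swap: the listen path saves the socket's original sk_data_ready in sk_user_data, installs its own hook, and on teardown restores the saved pointer; the hook always chains to the original, and a NULL sk_user_data signals the teardown race. A toy userspace model of that save/chain/restore discipline (types and names invented for illustration; the function-pointer-in-void* cast mirrors how sk_user_data is abused here):

#include <stdio.h>

/* A toy "socket" with one callback slot plus a save area, mirroring
 * how the listen code parks the original sk_data_ready in sk_user_data. */
struct toy_sock {
        void (*data_ready)(struct toy_sock *sk);
        void *user_data;        /* saved original callback, or NULL */
};

static void original_ready(struct toy_sock *sk)
{
        printf("original data_ready\n");
}

static void hooked_ready(struct toy_sock *sk)
{
        void (*ready)(struct toy_sock *sk);

        ready = (void (*)(struct toy_sock *))sk->user_data;
        if (!ready) {
                /* teardown race: the hook was removed under us, and
                 * data_ready already points at the original again */
                ready = sk->data_ready;
        } else {
                printf("transport work, then chain to the original\n");
        }
        ready(sk);
}

static void install(struct toy_sock *sk)
{
        sk->user_data = (void *)sk->data_ready;     /* save original */
        sk->data_ready = hooked_ready;
}

static void restore(struct toy_sock *sk)
{
        if (sk->user_data) {
                sk->data_ready = (void (*)(struct toy_sock *))sk->user_data;
                sk->user_data = NULL;
        }
}

int main(void)
{
        struct toy_sock sk = { original_ready, NULL };

        install(&sk);
        sk.data_ready(&sk);     /* hooked: work, then chain */
        restore(&sk);
        sk.data_ready(&sk);     /* back to the original */
        return 0;
}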
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
deleted file mode 100644 (file)
index fbc5ef8..0000000
+++ /dev/null
@@ -1,336 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-static struct kmem_cache *rds_tcp_incoming_slab;
-
-static void rds_tcp_inc_purge(struct rds_incoming *inc)
-{
-       struct rds_tcp_incoming *tinc;
-       tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
-       rdsdebug("purging tinc %p inc %p\n", tinc, inc);
-       skb_queue_purge(&tinc->ti_skb_list);
-}
-
-void rds_tcp_inc_free(struct rds_incoming *inc)
-{
-       struct rds_tcp_incoming *tinc;
-       tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
-       rds_tcp_inc_purge(inc);
-       rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
-       kmem_cache_free(rds_tcp_incoming_slab, tinc);
-}
-
-/*
- * this is pretty lame, but, whatever.
- */
-int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
-{
-       struct rds_tcp_incoming *tinc;
-       struct sk_buff *skb;
-       int ret = 0;
-
-       if (!iov_iter_count(to))
-               goto out;
-
-       tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
-
-       skb_queue_walk(&tinc->ti_skb_list, skb) {
-               unsigned long to_copy, skb_off;
-               for (skb_off = 0; skb_off < skb->len; skb_off += to_copy) {
-                       to_copy = iov_iter_count(to);
-                       to_copy = min(to_copy, skb->len - skb_off);
-
-                       if (skb_copy_datagram_iter(skb, skb_off, to, to_copy))
-                               return -EFAULT;
-
-                       rds_stats_add(s_copy_to_user, to_copy);
-                       ret += to_copy;
-
-                       if (!iov_iter_count(to))
-                               goto out;
-               }
-       }
-out:
-       return ret;
-}
-
-/*
- * We have a series of skbs that have fragmented pieces of the congestion
- * bitmap.  They must add up to the exact size of the congestion bitmap.  We
- * use the skb helpers to copy those into the pages that make up the in-memory
- * congestion bitmap for the remote address of this connection.  We then tell
- * the congestion core that the bitmap has been changed so that it can wake up
- * sleepers.
- *
- * This is racing with sending paths which are using test_bit to see if the
- * bitmap indicates that their recipient is congested.
- */
-
-static void rds_tcp_cong_recv(struct rds_connection *conn,
-                             struct rds_tcp_incoming *tinc)
-{
-       struct sk_buff *skb;
-       unsigned int to_copy, skb_off;
-       unsigned int map_off;
-       unsigned int map_page;
-       struct rds_cong_map *map;
-       int ret;
-
-       /* catch completely corrupt packets */
-       if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
-               return;
-
-       map_page = 0;
-       map_off = 0;
-       map = conn->c_fcong;
-
-       skb_queue_walk(&tinc->ti_skb_list, skb) {
-               skb_off = 0;
-               while (skb_off < skb->len) {
-                       to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
-                                       skb->len - skb_off);
-
-                       BUG_ON(map_page >= RDS_CONG_MAP_PAGES);
-
-                       /* only returns 0 or -error */
-                       ret = skb_copy_bits(skb, skb_off,
-                               (void *)map->m_page_addrs[map_page] + map_off,
-                               to_copy);
-                       BUG_ON(ret != 0);
-
-                       skb_off += to_copy;
-                       map_off += to_copy;
-                       if (map_off == PAGE_SIZE) {
-                               map_off = 0;
-                               map_page++;
-                       }
-               }
-       }
-
-       rds_cong_map_updated(map, ~(u64) 0);
-}
-
-struct rds_tcp_desc_arg {
-       struct rds_connection *conn;
-       gfp_t gfp;
-};
-
-static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
-                            unsigned int offset, size_t len)
-{
-       struct rds_tcp_desc_arg *arg = desc->arg.data;
-       struct rds_connection *conn = arg->conn;
-       struct rds_tcp_connection *tc = conn->c_transport_data;
-       struct rds_tcp_incoming *tinc = tc->t_tinc;
-       struct sk_buff *clone;
-       size_t left = len, to_copy;
-
-       rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
-                len);
-
-       /*
-        * tcp_read_sock() interprets partial progress as an indication to stop
-        * processing.
-        */
-       while (left) {
-               if (!tinc) {
-                       tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
-                                               arg->gfp);
-                       if (!tinc) {
-                               desc->error = -ENOMEM;
-                               goto out;
-                       }
-                       tc->t_tinc = tinc;
-                       rdsdebug("alloced tinc %p\n", tinc);
-                       rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
-                       /*
-                        * XXX * we might be able to use the __ variants when
-                        * we've already serialized at a higher level.
-                        */
-                       skb_queue_head_init(&tinc->ti_skb_list);
-               }
-
-               if (left && tc->t_tinc_hdr_rem) {
-                       to_copy = min(tc->t_tinc_hdr_rem, left);
-                       rdsdebug("copying %zu header from skb %p\n", to_copy,
-                                skb);
-                       skb_copy_bits(skb, offset,
-                                     (char *)&tinc->ti_inc.i_hdr +
-                                               sizeof(struct rds_header) -
-                                               tc->t_tinc_hdr_rem,
-                                     to_copy);
-                       tc->t_tinc_hdr_rem -= to_copy;
-                       left -= to_copy;
-                       offset += to_copy;
-
-                       if (tc->t_tinc_hdr_rem == 0) {
-                               /* could be 0 for a 0 len message */
-                               tc->t_tinc_data_rem =
-                                       be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
-                       }
-               }
-
-               if (left && tc->t_tinc_data_rem) {
-                       clone = skb_clone(skb, arg->gfp);
-                       if (!clone) {
-                               desc->error = -ENOMEM;
-                               goto out;
-                       }
-
-                       to_copy = min(tc->t_tinc_data_rem, left);
-                       pskb_pull(clone, offset);
-                       pskb_trim(clone, to_copy);
-                       skb_queue_tail(&tinc->ti_skb_list, clone);
-
-                       rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
-                                "clone %p data %p len %d\n",
-                                skb, skb->data, skb->len, offset, to_copy,
-                                clone, clone->data, clone->len);
-
-                       tc->t_tinc_data_rem -= to_copy;
-                       left -= to_copy;
-                       offset += to_copy;
-               }
-
-               if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
-                       if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
-                               rds_tcp_cong_recv(conn, tinc);
-                       else
-                               rds_recv_incoming(conn, conn->c_faddr,
-                                                 conn->c_laddr, &tinc->ti_inc,
-                                                 arg->gfp);
-
-                       tc->t_tinc_hdr_rem = sizeof(struct rds_header);
-                       tc->t_tinc_data_rem = 0;
-                       tc->t_tinc = NULL;
-                       rds_inc_put(&tinc->ti_inc);
-                       tinc = NULL;
-               }
-       }
-out:
-       rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
-                len, left, skb->len,
-                skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
-       return len - left;
-}
-
-/* the caller has to hold the sock lock */
-static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp)
-{
-       struct rds_tcp_connection *tc = conn->c_transport_data;
-       struct socket *sock = tc->t_sock;
-       read_descriptor_t desc;
-       struct rds_tcp_desc_arg arg;
-
-       /* It's like glib in the kernel! */
-       arg.conn = conn;
-       arg.gfp = gfp;
-       desc.arg.data = &arg;
-       desc.error = 0;
-       desc.count = 1; /* give more than one skb per call */
-
-       tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
-       rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
-                desc.error);
-
-       return desc.error;
-}
-
-/*
- * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from
- * data_ready.
- *
- * If we fail to allocate we're in trouble; blindly wait some time before
- * trying again to see if the VM can free up something for us.
- */
-int rds_tcp_recv(struct rds_connection *conn)
-{
-       struct rds_tcp_connection *tc = conn->c_transport_data;
-       struct socket *sock = tc->t_sock;
-       int ret = 0;
-
-       rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);
-
-       lock_sock(sock->sk);
-       ret = rds_tcp_read_sock(conn, GFP_KERNEL);
-       release_sock(sock->sk);
-
-       return ret;
-}
-
-void rds_tcp_data_ready(struct sock *sk)
-{
-       void (*ready)(struct sock *sk);
-       struct rds_connection *conn;
-       struct rds_tcp_connection *tc;
-
-       rdsdebug("data ready sk %p\n", sk);
-
-       read_lock(&sk->sk_callback_lock);
-       conn = sk->sk_user_data;
-       if (!conn) { /* check for teardown race */
-               ready = sk->sk_data_ready;
-               goto out;
-       }
-
-       tc = conn->c_transport_data;
-       ready = tc->t_orig_data_ready;
-       rds_tcp_stats_inc(s_tcp_data_ready_calls);
-
-       if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM)
-               queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
-out:
-       read_unlock(&sk->sk_callback_lock);
-       ready(sk);
-}
-
-int rds_tcp_recv_init(void)
-{
-       rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
-                                       sizeof(struct rds_tcp_incoming),
-                                       0, 0, NULL);
-       if (!rds_tcp_incoming_slab)
-               return -ENOMEM;
-       return 0;
-}
-
-void rds_tcp_recv_exit(void)
-{
-       kmem_cache_destroy(rds_tcp_incoming_slab);
-}
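rds_tcp_data_recv() above is a two-phase reassembly state machine driven by t_tinc_hdr_rem and t_tinc_data_rem: consume the fixed-size header first, learn the payload length from it, consume that many payload bytes, then rearm for the next message, regardless of how the stream is fragmented. A condensed userspace model of that machine, assuming a toy 4-byte header whose first byte is the length field (the real code queues skb clones where this sketch just counts bytes):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define HDR_LEN 4       /* stand-in for sizeof(struct rds_header) */

/* Reassembly state: header bytes, then payload bytes, still owed. */
static size_t hdr_rem = HDR_LEN;
static size_t data_rem;
static uint8_t hdr[HDR_LEN];

/* Feed len stream bytes into the state machine. */
static void data_recv(const uint8_t *buf, size_t len)
{
        size_t left = len, off = 0, to_copy;

        while (left) {
                if (hdr_rem) {                  /* phase 1: header */
                        to_copy = left < hdr_rem ? left : hdr_rem;
                        memcpy(hdr + (HDR_LEN - hdr_rem), buf + off, to_copy);
                        hdr_rem -= to_copy;
                        left -= to_copy;
                        off += to_copy;
                        if (hdr_rem == 0)       /* header done: learn length */
                                data_rem = hdr[0]; /* toy 1-byte h_len field */
                }
                if (left && data_rem) {         /* phase 2: payload */
                        to_copy = left < data_rem ? left : data_rem;
                        data_rem -= to_copy;    /* a real stack would queue */
                        left -= to_copy;        /* the bytes somewhere here */
                        off += to_copy;
                }
                /* both phases drained: deliver (h_len may be zero) and
                 * rearm for the next message on the same stream */
                if (hdr_rem == 0 && data_rem == 0) {
                        printf("message complete (%u payload bytes)\n",
                               (unsigned)hdr[0]);
                        hdr_rem = HDR_LEN;
                }
        }
}

int main(void)
{
        /* one message, h_len = 2, payload "ab", split across two reads */
        const uint8_t part1[] = { 2, 0, 0 };
        const uint8_t part2[] = { 0, 'a', 'b' };

        data_recv(part1, sizeof(part1));
        data_recv(part2, sizeof(part2));
        return 0;
}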
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
deleted file mode 100644 (file)
index 53b17ca..0000000
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-static void rds_tcp_cork(struct socket *sock, int val)
-{
-       mm_segment_t oldfs;
-
-       oldfs = get_fs();
-       set_fs(KERNEL_DS);
-       sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val,
-                             sizeof(val));
-       set_fs(oldfs);
-}
-
-void rds_tcp_xmit_prepare(struct rds_connection *conn)
-{
-       struct rds_tcp_connection *tc = conn->c_transport_data;
-
-       rds_tcp_cork(tc->t_sock, 1);
-}
-
-void rds_tcp_xmit_complete(struct rds_connection *conn)
-{
-       struct rds_tcp_connection *tc = conn->c_transport_data;
-
-       rds_tcp_cork(tc->t_sock, 0);
-}
-
-/* the core send_sem serializes this with other xmit and shutdown */
-static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
-{
-       struct kvec vec = {
-               .iov_base = data,
-               .iov_len = len,
-       };
-       struct msghdr msg = {
-               .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
-       };
-
-       return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
-}
-
-/* the core send_sem serializes this with other xmit and shutdown */
-int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
-                unsigned int hdr_off, unsigned int sg, unsigned int off)
-{
-       struct rds_tcp_connection *tc = conn->c_transport_data;
-       int done = 0;
-       int ret = 0;
-
-       if (hdr_off == 0) {
-               /*
-                * m_ack_seq is set to the sequence number of the last byte of
-                * header and data.  see rds_tcp_is_acked().
-                */
-               tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc);
-               rm->m_ack_seq = tc->t_last_sent_nxt +
-                               sizeof(struct rds_header) +
-                               be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1;
-               smp_mb__before_atomic();
-               set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
-               tc->t_last_expected_una = rm->m_ack_seq + 1;
-
-               rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
-                        rm, rds_tcp_snd_nxt(tc),
-                        (unsigned long long)rm->m_ack_seq);
-       }
-
-       if (hdr_off < sizeof(struct rds_header)) {
-               /* see rds_tcp_write_space() */
-               set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags);
-
-               ret = rds_tcp_sendmsg(tc->t_sock,
-                                     (void *)&rm->m_inc.i_hdr + hdr_off,
-                                     sizeof(rm->m_inc.i_hdr) - hdr_off);
-               if (ret < 0)
-                       goto out;
-               done += ret;
-               if (hdr_off + done != sizeof(struct rds_header))
-                       goto out;
-       }
-
-       while (sg < rm->data.op_nents) {
-               ret = tc->t_sock->ops->sendpage(tc->t_sock,
-                                               sg_page(&rm->data.op_sg[sg]),
-                                               rm->data.op_sg[sg].offset + off,
-                                               rm->data.op_sg[sg].length - off,
-                                               MSG_DONTWAIT|MSG_NOSIGNAL);
-               rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
-                        rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
-                        ret);
-               if (ret <= 0)
-                       break;
-
-               off += ret;
-               done += ret;
-               if (off == rm->data.op_sg[sg].length) {
-                       off = 0;
-                       sg++;
-               }
-       }
-
-out:
-       if (ret <= 0) {
-               /* write_space will hit after EAGAIN, all else fatal */
-               if (ret == -EAGAIN) {
-                       rds_tcp_stats_inc(s_tcp_sndbuf_full);
-                       ret = 0;
-               } else {
-                       printk(KERN_WARNING "RDS/tcp: send to %pI4 "
-                              "returned %d, disconnecting and reconnecting\n",
-                              &conn->c_faddr, ret);
-                       rds_conn_drop(conn);
-               }
-       }
-       if (done == 0)
-               done = ret;
-       return done;
-}
-
-/*
- * rm->m_ack_seq is set to the tcp sequence number that corresponds to the
- * last byte of the message, including the header.  This means that the
- * entire message has been received if rm->m_ack_seq is "before" the next
- * unacked byte of the TCP sequence space.  We have to do very careful
- * wrapping 32bit comparisons here.
- */
-static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
-{
-       if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags))
-               return 0;
-       return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0;
-}
-
-void rds_tcp_write_space(struct sock *sk)
-{
-       void (*write_space)(struct sock *sk);
-       struct rds_connection *conn;
-       struct rds_tcp_connection *tc;
-
-       read_lock(&sk->sk_callback_lock);
-       conn = sk->sk_user_data;
-       if (!conn) {
-               write_space = sk->sk_write_space;
-               goto out;
-       }
-
-       tc = conn->c_transport_data;
-       rdsdebug("write_space for tc %p\n", tc);
-       write_space = tc->t_orig_write_space;
-       rds_tcp_stats_inc(s_tcp_write_space_calls);
-
-       rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc));
-       tc->t_last_seen_una = rds_tcp_snd_una(tc);
-       rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked);
-
-       if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
-               queue_delayed_work(rds_wq, &conn->c_send_w, 0);
-
-out:
-       read_unlock(&sk->sk_callback_lock);
-
-       /*
-        * write_space is only called when data leaves tcp's send queue if
-        * SOCK_NOSPACE is set.  We set SOCK_NOSPACE every time we put
-        * data in tcp's send queue because we use write_space to parse the
-        * sequence numbers and notice that rds messages have been fully
-        * received.
-        *
-        * tcp's write_space clears SOCK_NOSPACE if the send queue has more
-        * than a certain amount of space. So we need to set it again *after*
-        * we call tcp's write_space or else we might only get called on the
-        * first of a series of incoming tcp acks.
-        */
-       write_space(sk);
-
-       if (sk->sk_socket)
-               set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-}
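rds_tcp_is_acked() above relies on wrapping 32-bit arithmetic: subtracting two sequence numbers and testing the sign bit of the signed result gives a correct before/after answer even across the 2^32 wrap, provided the two values stay within 2^31 of each other (i.e. fewer than 2 GiB in flight). A small self-checking example of that comparison:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* "seq is before ack" in 32-bit wrapping arithmetic, as in
 * rds_tcp_is_acked(): subtract, then look at the sign bit. */
static int seq_before(uint32_t seq, uint32_t ack)
{
        return (int32_t)(seq - ack) < 0;
}

int main(void)
{
        /* plain case: seq 100 counts as acked once una reaches 101 */
        assert(seq_before(100, 101));
        assert(!seq_before(101, 100));

        /* wraparound: message seq near UINT32_MAX, ack just past zero.
         * 0xfffffffe - 0x00000005 == 0xfffffff9, negative as int32_t,
         * so the message correctly counts as acked. */
        assert(seq_before(0xfffffffeu, 0x00000005u));
        assert(!seq_before(0x00000005u, 0xfffffffeu));

        printf("wrapping comparisons behave as expected\n");
        return 0;
}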
diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c
deleted file mode 100644 (file)
index f8a7954..0000000
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/percpu.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats)
-       ____cacheline_aligned;
-
-static const char * const rds_tcp_stat_names[] = {
-       "tcp_data_ready_calls",
-       "tcp_write_space_calls",
-       "tcp_sndbuf_full",
-       "tcp_connect_raced",
-       "tcp_listen_closed_stale",
-};
-
-unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
-                                    unsigned int avail)
-{
-       struct rds_tcp_statistics stats = {0, };
-       uint64_t *src;
-       uint64_t *sum;
-       size_t i;
-       int cpu;
-
-       if (avail < ARRAY_SIZE(rds_tcp_stat_names))
-               goto out;
-
-       for_each_online_cpu(cpu) {
-               src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu));
-               sum = (uint64_t *)&stats;
-               for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
-                       *(sum++) += *(src++);
-       }
-
-       rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names,
-                           ARRAY_SIZE(rds_tcp_stat_names));
-out:
-       return ARRAY_SIZE(rds_tcp_stat_names);
-}
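rds_tcp_stats_info_copy() above exploits the fact that struct rds_tcp_statistics contains nothing but uint64_t counters, so each per-CPU copy can be walked as a flat array and summed slot by slot. The same trick in plain C (a hypothetical three-counter struct and a fixed CPU count stand in for the real per-CPU machinery):

#include <stdint.h>
#include <stdio.h>

#define NCPUS 4

/* Every field must be uint64_t so the struct can be walked as an array. */
struct stats {
        uint64_t data_ready_calls;
        uint64_t write_space_calls;
        uint64_t sndbuf_full;
};

static struct stats percpu[NCPUS] = {
        { 1, 2, 3 }, { 10, 20, 30 }, { 100, 200, 300 }, { 0, 1, 0 },
};

int main(void)
{
        struct stats total = { 0 };
        size_t i;
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++) {
                const uint64_t *src = (const uint64_t *)&percpu[cpu];
                uint64_t *sum = (uint64_t *)&total;

                for (i = 0; i < sizeof(total) / sizeof(uint64_t); i++)
                        sum[i] += src[i];
        }

        printf("data_ready=%llu write_space=%llu sndbuf_full=%llu\n",
               (unsigned long long)total.data_ready_calls,
               (unsigned long long)total.write_space_calls,
               (unsigned long long)total.sndbuf_full);
        return 0;
}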
diff --git a/net/rds/threads.c b/net/rds/threads.c
index dc2402e871fda52d10b0a67089c7c6e13832f72b..828a1bf9ea9236055956b9f4abdc4da80abc97d5 100644 (file)
@@ -32,7 +32,6 @@
  */
 #include <linux/kernel.h>
 #include <linux/random.h>
-#include <linux/export.h>
 
 #include "rds.h"
 
  *
  * Transition to state DISCONNECTING/DOWN:
  *  -  Inside the shutdown worker; synchronizes with xmit path
- *     through RDS_IN_XMIT, and with connection management callbacks
+ *     through c_send_lock, and with connection management callbacks
  *     via c_cm_lock.
  *
  *     For receive callbacks, we rely on the underlying transport
  *     (TCP, IB/RDMA) to provide the necessary synchronisation.
  */
 struct workqueue_struct *rds_wq;
-EXPORT_SYMBOL_GPL(rds_wq);
 
 void rds_connect_complete(struct rds_connection *conn)
 {
@@ -78,7 +76,8 @@ void rds_connect_complete(struct rds_connection *conn)
                                "current state is %d\n",
                                __func__,
                                atomic_read(&conn->c_state));
-               rds_conn_drop(conn);
+               atomic_set(&conn->c_state, RDS_CONN_ERROR);
+               queue_work(rds_wq, &conn->c_down_w);
                return;
        }
 
@@ -90,7 +89,6 @@ void rds_connect_complete(struct rds_connection *conn)
        queue_delayed_work(rds_wq, &conn->c_send_w, 0);
        queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
 }
-EXPORT_SYMBOL_GPL(rds_connect_complete);
 
 /*
  * This random exponential backoff is relied on to eventually resolve racing
@@ -110,7 +108,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete);
  * We should *always* start with a random backoff; otherwise a broken connection
  * will always take several iterations to be re-established.
  */
-void rds_queue_reconnect(struct rds_connection *conn)
+static void rds_queue_reconnect(struct rds_connection *conn)
 {
        unsigned long rand;
 
@@ -156,6 +154,58 @@ void rds_connect_worker(struct work_struct *work)
        }
 }
 
+void rds_shutdown_worker(struct work_struct *work)
+{
+       struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+
+       /* shut it down unless it's down already */
+       if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+               /*
+                * Quiesce the connection mgmt handlers before we start tearing
+                * things down. We don't hold the mutex for the entire
+                * duration of the shutdown operation, else we may be
+                * deadlocking with the CM handler. Instead, the CM event
+                * handler is supposed to check for state DISCONNECTING
+                */
+               mutex_lock(&conn->c_cm_lock);
+               if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
+                && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
+                       rds_conn_error(conn, "shutdown called in state %d\n",
+                                       atomic_read(&conn->c_state));
+                       mutex_unlock(&conn->c_cm_lock);
+                       return;
+               }
+               mutex_unlock(&conn->c_cm_lock);
+
+               mutex_lock(&conn->c_send_lock);
+               conn->c_trans->conn_shutdown(conn);
+               rds_conn_reset(conn);
+               mutex_unlock(&conn->c_send_lock);
+
+               if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+                       /* This can happen - e.g. when we're in the middle of tearing
+                        * down the connection, and someone unloads the rds module.
+                        * Quite reproducible with loopback connections.
+                        * Mostly harmless.
+                        */
+                       rds_conn_error(conn,
+                               "%s: failed to transition to state DOWN, "
+                               "current state is %d\n",
+                               __func__,
+                               atomic_read(&conn->c_state));
+                       return;
+               }
+       }
+
+       /* Then reconnect if it's still live.
+        * The passive side of an IB loopback connection is never added
+        * to the conn hash, so we never trigger a reconnect on this
+        * conn - the reconnect is always triggered by the active peer. */
+       cancel_delayed_work(&conn->c_conn_w);
+       if (!hlist_unhashed(&conn->c_hash_node))
+               rds_queue_reconnect(conn);
+}
+
 void rds_send_worker(struct work_struct *work)
 {
        struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
@@ -200,22 +250,15 @@ void rds_recv_worker(struct work_struct *work)
        }
 }
 
-void rds_shutdown_worker(struct work_struct *work)
-{
-       struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
-
-       rds_conn_shutdown(conn);
-}
-
 void rds_threads_exit(void)
 {
        destroy_workqueue(rds_wq);
 }
 
-int rds_threads_init(void)
+int __init rds_threads_init(void)
 {
        rds_wq = create_singlethread_workqueue("krdsd");
-       if (!rds_wq)
+       if (rds_wq == NULL)
                return -ENOMEM;
 
        return 0;
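The rds_shutdown_worker() reintroduced above gates every step on rds_conn_transition(), an atomic compare-and-exchange on c_state: the move from the expected old state to the new one either succeeds or tells the caller that another path won the race. A userspace sketch of that primitive using C11 atomics (the state names mirror the diff, but the code is illustrative, not the kernel implementation):

#include <stdatomic.h>
#include <stdio.h>

enum { CONN_DOWN, CONN_CONNECTING, CONN_UP, CONN_DISCONNECTING, CONN_ERROR };

static _Atomic int c_state = CONN_UP;

/* Move old -> new atomically; returns 1 on success, 0 if the state was
 * no longer 'old' (another path got there first), mirroring what
 * rds_conn_transition() does with a cmpxchg on c_state. */
static int conn_transition(int old, int new)
{
        return atomic_compare_exchange_strong(&c_state, &old, new);
}

int main(void)
{
        /* shutdown path: only one caller can claim UP -> DISCONNECTING */
        if (conn_transition(CONN_UP, CONN_DISCONNECTING))
                printf("we own the teardown\n");

        /* a second attempt loses the race and must back off */
        if (!conn_transition(CONN_UP, CONN_DISCONNECTING))
                printf("someone else is already tearing down\n");

        conn_transition(CONN_DISCONNECTING, CONN_DOWN);
        printf("final state: %d\n", (int)atomic_load(&c_state));
        return 0;
}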
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 7f2ac4fec3678b28715b95094c6346bcc49333e1..767da61ad2f397c577cd5ea7a81fb91392b1bf73 100644 (file)
@@ -37,7 +37,7 @@
 #include "rds.h"
 #include "loop.h"
 
-static struct rds_transport *transports[RDS_TRANS_COUNT];
+static LIST_HEAD(rds_transports);
 static DECLARE_RWSEM(rds_trans_sem);
 
 int rds_trans_register(struct rds_transport *trans)
@@ -46,52 +46,35 @@ int rds_trans_register(struct rds_transport *trans)
 
        down_write(&rds_trans_sem);
 
-       if (transports[trans->t_type])
-               printk(KERN_ERR "RDS Transport type %d already registered\n",
-                       trans->t_type);
-       else {
-               transports[trans->t_type] = trans;
-               printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
-       }
+       list_add_tail(&trans->t_item, &rds_transports);
+       printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
 
        up_write(&rds_trans_sem);
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(rds_trans_register);
 
 void rds_trans_unregister(struct rds_transport *trans)
 {
        down_write(&rds_trans_sem);
 
-       transports[trans->t_type] = NULL;
+       list_del_init(&trans->t_item);
        printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name);
 
        up_write(&rds_trans_sem);
 }
-EXPORT_SYMBOL_GPL(rds_trans_unregister);
-
-void rds_trans_put(struct rds_transport *trans)
-{
-       if (trans && trans->t_owner)
-               module_put(trans->t_owner);
-}
 
 struct rds_transport *rds_trans_get_preferred(__be32 addr)
 {
-       struct rds_transport *ret = NULL;
        struct rds_transport *trans;
-       unsigned int i;
+       struct rds_transport *ret = NULL;
 
        if (IN_LOOPBACK(ntohl(addr)))
                return &rds_loop_transport;
 
        down_read(&rds_trans_sem);
-       for (i = 0; i < RDS_TRANS_COUNT; i++) {
-               trans = transports[i];
-
-               if (trans && (trans->laddr_check(addr) == 0) &&
-                   (!trans->t_owner || try_module_get(trans->t_owner))) {
+       list_for_each_entry(trans, &rds_transports, t_item) {
+               if (trans->laddr_check(addr) == 0) {
                        ret = trans;
                        break;
                }
@@ -114,15 +97,12 @@ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
        struct rds_transport *trans;
        unsigned int total = 0;
        unsigned int part;
-       int i;
 
        rds_info_iter_unmap(iter);
        down_read(&rds_trans_sem);
 
-       for (i = 0; i < RDS_TRANS_COUNT; i++)
-       {
-               trans = transports[i];
-               if (!trans || !trans->stats_info_copy)
+       list_for_each_entry(trans, &rds_transports, t_item) {
+               if (trans->stats_info_copy == NULL)
                        continue;
 
                part = trans->stats_info_copy(iter, avail);