#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sock_diag.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#ifdef CONFIG_BPF_SYSCALL
union bpf_tcp_iter_batch_item {
struct sock *sk;
+ __u64 cookie;
};
struct bpf_tcp_iter_state {
static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
{
+ union bpf_tcp_iter_batch_item *item;
unsigned int cur_sk = iter->cur_sk;
+ __u64 cookie;
- while (cur_sk < iter->end_sk)
- sock_gen_put(iter->batch[cur_sk++].sk);
+ /* Remember the cookies of the sockets we haven't seen yet, so we can
+ * pick up where we left off next time around.
+ */
+ while (cur_sk < iter->end_sk) {
+ item = &iter->batch[cur_sk++];
+ cookie = sock_gen_cookie(item->sk);
+ sock_gen_put(item->sk);
+ item->cookie = cookie;
+ }
}
static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
return 0;
}
+static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
+ union bpf_tcp_iter_batch_item *cookies,
+ int n_cookies)
+{
+ struct hlist_nulls_node *node;
+ struct sock *sk;
+ int i;
+
+ for (i = 0; i < n_cookies; i++) {
+ sk = first_sk;
+ sk_nulls_for_each_from(sk, node)
+ if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
+ return sk;
+ }
+
+ return NULL;
+}
+
+static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
+{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ unsigned int find_cookie = iter->cur_sk;
+ unsigned int end_cookie = iter->end_sk;
+ int resume_bucket = st->bucket;
+ struct sock *sk;
+
+ if (end_cookie && find_cookie == end_cookie)
+ ++st->bucket;
+
+ sk = listening_get_first(seq);
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+
+ if (sk && st->bucket == resume_bucket && end_cookie) {
+ sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+ if (!sk) {
+ spin_unlock(&hinfo->lhash2[st->bucket].lock);
+ ++st->bucket;
+ sk = listening_get_first(seq);
+ }
+ }
+
+ return sk;
+}
+
+static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
+{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ unsigned int find_cookie = iter->cur_sk;
+ unsigned int end_cookie = iter->end_sk;
+ int resume_bucket = st->bucket;
+ struct sock *sk;
+
+ if (end_cookie && find_cookie == end_cookie)
+ ++st->bucket;
+
+ sk = established_get_first(seq);
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+
+ if (sk && st->bucket == resume_bucket && end_cookie) {
+ sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+ if (!sk) {
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+ ++st->bucket;
+ sk = established_get_first(seq);
+ }
+ }
+
+ return sk;
+}
+
+static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ struct sock *sk = NULL;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_LISTENING:
+ sk = bpf_iter_tcp_resume_listening(seq);
+ if (sk)
+ break;
+ st->bucket = 0;
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ fallthrough;
+ case TCP_SEQ_STATE_ESTABLISHED:
+ sk = bpf_iter_tcp_resume_established(seq);
+ break;
+ }
+
+ return sk;
+}
+
static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
struct sock **start_sk)
{
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
- struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct bpf_tcp_iter_state *iter = seq->private;
- struct tcp_iter_state *st = &iter->state;
unsigned int expected;
struct sock *sk;
int err;
- /* The st->bucket is done. Directly advance to the next
- * bucket instead of having the tcp_seek_last_pos() to skip
- * one by one in the current bucket and eventually find out
- * it has to advance to the next bucket.
- */
- if (iter->end_sk && iter->cur_sk == iter->end_sk) {
- st->offset = 0;
- st->bucket++;
- if (st->state == TCP_SEQ_STATE_LISTENING &&
- st->bucket > hinfo->lhash2_mask) {
- st->state = TCP_SEQ_STATE_ESTABLISHED;
- st->bucket = 0;
- }
- }
-
- iter->cur_sk = 0;
- iter->end_sk = 0;
-
- sk = tcp_seek_last_pos(seq);
+ sk = bpf_iter_tcp_resume(seq);
if (!sk)
return NULL; /* Done */
if (err)
return ERR_PTR(err);
- iter->cur_sk = 0;
- iter->end_sk = 0;
-
- sk = tcp_seek_last_pos(seq);
+ sk = bpf_iter_tcp_resume(seq);
if (!sk)
return NULL; /* Done */
* meta.seq_num is used instead.
*/
st->num++;
- /* Move st->offset to the next sk in the bucket such that
- * the future start() will resume at st->offset in
- * st->bucket. See tcp_seek_last_pos().
- */
- st->offset++;
sock_gen_put(iter->batch[iter->cur_sk++].sk);
}