From b219bcfcc92e9bd50c6277ac68cb75f64b403e5e Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 14 Nov 2024 15:09:55 +0100 Subject: [PATCH 01/16] selftests: net: lib: Move logging from forwarding/lib.sh here Many net selftests invent their own logging helpers. These really should be in a library sourced by these tests. Currently forwarding/lib.sh has a suite of perfectly fine logging helpers, but sourcing a forwarding/ library from a higher-level directory smells of layering violation. In this patch, move the logging helpers to net/lib.sh so that every net test can use them. Together with the logging helpers, it's also necessary to move pause_on_fail(), and EXIT_STATUS and RET. Existing lib.sh users might be using these same names for their functions or variables. However lib.sh is always sourced near the top of the file (checked), and whatever new definitions will simply override the ones provided by lib.sh. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Acked-by: Shuah Khan Link: https://patch.msgid.link/edd3785a3bd72ffbe1409300989e993ee50ae98b.1731589511.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 113 ----------------- tools/testing/selftests/net/lib.sh | 115 ++++++++++++++++++ 2 files changed, 115 insertions(+), 113 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 89c25f72b10c..41dd14c42c48 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -48,7 +48,6 @@ declare -A NETIFS=( : "${WAIT_TIME:=5}" # Whether to pause on, respectively, after a failure and before cleanup. -: "${PAUSE_ON_FAIL:=no}" : "${PAUSE_ON_CLEANUP:=no}" # Whether to create virtual interfaces, and what netdevice type they should be. @@ -446,22 +445,6 @@ done ############################################################################## # Helpers -# Exit status to return at the end. Set in case one of the tests fails. -EXIT_STATUS=0 -# Per-test return value. Clear at the beginning of each test. -RET=0 - -ret_set_ksft_status() -{ - local ksft_status=$1; shift - local msg=$1; shift - - RET=$(ksft_status_merge $RET $ksft_status) - if (( $? )); then - retmsg=$msg - fi -} - # Whether FAILs should be interpreted as XFAILs. Internal. FAIL_TO_XFAIL= @@ -535,102 +518,6 @@ xfail_on_veth() fi } -log_test_result() -{ - local test_name=$1; shift - local opt_str=$1; shift - local result=$1; shift - local retmsg=$1; shift - - printf "TEST: %-60s [%s]\n" "$test_name $opt_str" "$result" - if [[ $retmsg ]]; then - printf "\t%s\n" "$retmsg" - fi -} - -pause_on_fail() -{ - if [[ $PAUSE_ON_FAIL == yes ]]; then - echo "Hit enter to continue, 'q' to quit" - read a - [[ $a == q ]] && exit 1 - fi -} - -handle_test_result_pass() -{ - local test_name=$1; shift - local opt_str=$1; shift - - log_test_result "$test_name" "$opt_str" " OK " -} - -handle_test_result_fail() -{ - local test_name=$1; shift - local opt_str=$1; shift - - log_test_result "$test_name" "$opt_str" FAIL "$retmsg" - pause_on_fail -} - -handle_test_result_xfail() -{ - local test_name=$1; shift - local opt_str=$1; shift - - log_test_result "$test_name" "$opt_str" XFAIL "$retmsg" - pause_on_fail -} - -handle_test_result_skip() -{ - local test_name=$1; shift - local opt_str=$1; shift - - log_test_result "$test_name" "$opt_str" SKIP "$retmsg" -} - -log_test() -{ - local test_name=$1 - local opt_str=$2 - - if [[ $# -eq 2 ]]; then - opt_str="($opt_str)" - fi - - if ((RET == ksft_pass)); then - handle_test_result_pass "$test_name" "$opt_str" - elif ((RET == ksft_xfail)); then - handle_test_result_xfail "$test_name" "$opt_str" - elif ((RET == ksft_skip)); then - handle_test_result_skip "$test_name" "$opt_str" - else - handle_test_result_fail "$test_name" "$opt_str" - fi - - EXIT_STATUS=$(ksft_exit_status_merge $EXIT_STATUS $RET) - return $RET -} - -log_test_skip() -{ - RET=$ksft_skip retmsg= log_test "$@" -} - -log_test_xfail() -{ - RET=$ksft_xfail retmsg= log_test "$@" -} - -log_info() -{ - local msg=$1 - - echo "INFO: $msg" -} - not() { "$@" diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index c8991cc6bf28..691318b1ec55 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -9,6 +9,9 @@ source "$net_dir/lib/sh/defer.sh" : "${WAIT_TIMEOUT:=20}" +# Whether to pause on after a failure. +: "${PAUSE_ON_FAIL:=no}" + BUSYWAIT_TIMEOUT=$((WAIT_TIMEOUT * 1000)) # ms # Kselftest framework constants. @@ -20,6 +23,11 @@ ksft_skip=4 # namespace list created by setup_ns NS_LIST=() +# Exit status to return at the end. Set in case one of the tests fails. +EXIT_STATUS=0 +# Per-test return value. Clear at the beginning of each test. +RET=0 + ############################################################################## # Helpers @@ -236,3 +244,110 @@ tc_rule_handle_stats_get() | jq ".[] | select(.options.handle == $handle) | \ .options.actions[0].stats$selector" } + +ret_set_ksft_status() +{ + local ksft_status=$1; shift + local msg=$1; shift + + RET=$(ksft_status_merge $RET $ksft_status) + if (( $? )); then + retmsg=$msg + fi +} + +log_test_result() +{ + local test_name=$1; shift + local opt_str=$1; shift + local result=$1; shift + local retmsg=$1; shift + + printf "TEST: %-60s [%s]\n" "$test_name $opt_str" "$result" + if [[ $retmsg ]]; then + printf "\t%s\n" "$retmsg" + fi +} + +pause_on_fail() +{ + if [[ $PAUSE_ON_FAIL == yes ]]; then + echo "Hit enter to continue, 'q' to quit" + read a + [[ $a == q ]] && exit 1 + fi +} + +handle_test_result_pass() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" " OK " +} + +handle_test_result_fail() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" FAIL "$retmsg" + pause_on_fail +} + +handle_test_result_xfail() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" XFAIL "$retmsg" + pause_on_fail +} + +handle_test_result_skip() +{ + local test_name=$1; shift + local opt_str=$1; shift + + log_test_result "$test_name" "$opt_str" SKIP "$retmsg" +} + +log_test() +{ + local test_name=$1 + local opt_str=$2 + + if [[ $# -eq 2 ]]; then + opt_str="($opt_str)" + fi + + if ((RET == ksft_pass)); then + handle_test_result_pass "$test_name" "$opt_str" + elif ((RET == ksft_xfail)); then + handle_test_result_xfail "$test_name" "$opt_str" + elif ((RET == ksft_skip)); then + handle_test_result_skip "$test_name" "$opt_str" + else + handle_test_result_fail "$test_name" "$opt_str" + fi + + EXIT_STATUS=$(ksft_exit_status_merge $EXIT_STATUS $RET) + return $RET +} + +log_test_skip() +{ + RET=$ksft_skip retmsg= log_test "$@" +} + +log_test_xfail() +{ + RET=$ksft_xfail retmsg= log_test "$@" +} + +log_info() +{ + local msg=$1 + + echo "INFO: $msg" +} -- 2.51.0 From 601d9d70a40a8ccf93f41a153dd4c9aa1db60d57 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 14 Nov 2024 15:09:56 +0100 Subject: [PATCH 02/16] selftests: net: lib: Move tests_run from forwarding/lib.sh here It would be good to use the same mechanism for scheduling and dispatching general net tests as the many forwarding tests already use. To that end, move the logging helpers to net/lib.sh so that every net test can use them. Existing lib.sh users might be using the name themselves. However lib.sh is always sourced near the top of the file (checked), and whatever new definition will simply override the one provided by lib.sh. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Acked-by: Shuah Khan Link: https://patch.msgid.link/a6fc083486493425b2c61185c327845b6ce3233a.1731589511.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 10 ---------- tools/testing/selftests/net/lib.sh | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 41dd14c42c48..d28dbf27c1f0 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -1285,16 +1285,6 @@ matchall_sink_create() action drop } -tests_run() -{ - local current_test - - for current_test in ${TESTS:-$ALL_TESTS}; do - in_defer_scope \ - $current_test - done -} - cleanup() { pre_cleanup diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 691318b1ec55..4f52b8e48a3a 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -351,3 +351,13 @@ log_info() echo "INFO: $msg" } + +tests_run() +{ + local current_test + + for current_test in ${TESTS:-$ALL_TESTS}; do + in_defer_scope \ + $current_test + done +} -- 2.51.0 From af76b4431818cf7a73cf0ec19465ad3b01cdb159 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 14 Nov 2024 15:09:57 +0100 Subject: [PATCH 03/16] selftests: net: lib: Move checks from forwarding/lib.sh here For logging to be useful, something has to set RET and retmsg by calling ret_set_ksft_status(). There is a suite of functions to that end in forwarding/lib: check_err, check_fail et.al. Move them to net/lib.sh so that every net test can use them. Existing lib.sh users might be using these same names for their functions. However lib.sh is always sourced near the top of the file (checked), and whatever new definitions will simply override the ones provided by lib.sh. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Acked-by: Shuah Khan Link: https://patch.msgid.link/f488a00dc85b8e0c1f3c71476b32b21b5189a847.1731589511.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 73 ------------------- tools/testing/selftests/net/lib.sh | 73 +++++++++++++++++++ 2 files changed, 73 insertions(+), 73 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index d28dbf27c1f0..8625e3c99f55 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -445,79 +445,6 @@ done ############################################################################## # Helpers -# Whether FAILs should be interpreted as XFAILs. Internal. -FAIL_TO_XFAIL= - -check_err() -{ - local err=$1 - local msg=$2 - - if ((err)); then - if [[ $FAIL_TO_XFAIL = yes ]]; then - ret_set_ksft_status $ksft_xfail "$msg" - else - ret_set_ksft_status $ksft_fail "$msg" - fi - fi -} - -check_fail() -{ - local err=$1 - local msg=$2 - - check_err $((!err)) "$msg" -} - -check_err_fail() -{ - local should_fail=$1; shift - local err=$1; shift - local what=$1; shift - - if ((should_fail)); then - check_fail $err "$what succeeded, but should have failed" - else - check_err $err "$what failed" - fi -} - -xfail() -{ - FAIL_TO_XFAIL=yes "$@" -} - -xfail_on_slow() -{ - if [[ $KSFT_MACHINE_SLOW = yes ]]; then - FAIL_TO_XFAIL=yes "$@" - else - "$@" - fi -} - -omit_on_slow() -{ - if [[ $KSFT_MACHINE_SLOW != yes ]]; then - "$@" - fi -} - -xfail_on_veth() -{ - local dev=$1; shift - local kind - - kind=$(ip -j -d link show dev $dev | - jq -r '.[].linkinfo.info_kind') - if [[ $kind = veth ]]; then - FAIL_TO_XFAIL=yes "$@" - else - "$@" - fi -} - not() { "$@" diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 4f52b8e48a3a..6bcf5d13879d 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -361,3 +361,76 @@ tests_run() $current_test done } + +# Whether FAILs should be interpreted as XFAILs. Internal. +FAIL_TO_XFAIL= + +check_err() +{ + local err=$1 + local msg=$2 + + if ((err)); then + if [[ $FAIL_TO_XFAIL = yes ]]; then + ret_set_ksft_status $ksft_xfail "$msg" + else + ret_set_ksft_status $ksft_fail "$msg" + fi + fi +} + +check_fail() +{ + local err=$1 + local msg=$2 + + check_err $((!err)) "$msg" +} + +check_err_fail() +{ + local should_fail=$1; shift + local err=$1; shift + local what=$1; shift + + if ((should_fail)); then + check_fail $err "$what succeeded, but should have failed" + else + check_err $err "$what failed" + fi +} + +xfail() +{ + FAIL_TO_XFAIL=yes "$@" +} + +xfail_on_slow() +{ + if [[ $KSFT_MACHINE_SLOW = yes ]]; then + FAIL_TO_XFAIL=yes "$@" + else + "$@" + fi +} + +omit_on_slow() +{ + if [[ $KSFT_MACHINE_SLOW != yes ]]; then + "$@" + fi +} + +xfail_on_veth() +{ + local dev=$1; shift + local kind + + kind=$(ip -j -d link show dev $dev | + jq -r '.[].linkinfo.info_kind') + if [[ $kind = veth ]]; then + FAIL_TO_XFAIL=yes "$@" + else + "$@" + fi +} -- 2.51.0 From 46f6569cf0754e27816403c3701c7070ff281ad0 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 14 Nov 2024 15:09:58 +0100 Subject: [PATCH 04/16] selftests: net: lib: Add kill_process A number of selftests run processes in the background and need to kill them afterwards. Instead for everyone to open-code the kill / wait / redirect mantra, add a helper in net/lib.sh. Convert existing open-code sites. Signed-off-by: Petr Machata Acked-by: Shuah Khan Reviewed-by: Amit Cohen Link: https://patch.msgid.link/a9db102067d741c118f0bd93b10c75e2a34665ea.1731589511.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/mlxsw/devlink_trap.sh | 2 +- .../drivers/net/mlxsw/devlink_trap_l3_drops.sh | 4 ++-- .../drivers/net/mlxsw/devlink_trap_l3_exceptions.sh | 12 ++++++------ .../drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh | 4 ++-- .../drivers/net/mlxsw/devlink_trap_tunnel_ipip6.sh | 4 ++-- .../drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh | 4 ++-- .../net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh | 4 ++-- .../testing/selftests/drivers/net/mlxsw/tc_sample.sh | 4 ++-- .../drivers/net/netdevsim/fib_notifications.sh | 6 +++--- tools/testing/selftests/net/drop_monitor_tests.sh | 2 +- tools/testing/selftests/net/fib_tests.sh | 8 ++++---- .../testing/selftests/net/forwarding/devlink_lib.sh | 2 +- tools/testing/selftests/net/forwarding/lib.sh | 3 +-- tools/testing/selftests/net/forwarding/tc_police.sh | 8 ++++---- tools/testing/selftests/net/lib.sh | 8 ++++++++ 15 files changed, 41 insertions(+), 34 deletions(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap.sh index 89b55e946eed..36055279ba92 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap.sh @@ -116,7 +116,7 @@ dev_del_test() log_test "Device delete" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid } trap cleanup EXIT diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh index 160891dcb4bc..db5806d189bb 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh @@ -595,7 +595,7 @@ irif_disabled_test() log_test "Ingress RIF disabled" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid ip link set dev $rp1 nomaster __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64 ip link del dev br0 type bridge @@ -645,7 +645,7 @@ erif_disabled_test() log_test "Egress RIF disabled" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64 ip link del dev br0 type bridge devlink_trap_action_set $trap_name "drop" diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh index 190c1b6b5365..5d6d88b600f0 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh @@ -202,7 +202,7 @@ mtu_value_is_too_small_test() mtu_restore $rp2 - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid tc filter del dev $h1 ingress protocol ip pref 1 handle 101 flower } @@ -235,7 +235,7 @@ __ttl_value_is_too_small_test() log_test "TTL value is too small: TTL=$ttl_val" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid tc filter del dev $h1 ingress protocol ip pref 1 handle 101 flower } @@ -299,7 +299,7 @@ __mc_reverse_path_forwarding_test() log_test "Multicast reverse path forwarding: $desc" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid tc filter del dev $rp2 egress protocol $proto pref 1 handle 101 flower } @@ -347,7 +347,7 @@ __reject_route_test() log_test "Reject route: $desc" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid ip route del unreachable $unreachable tc filter del dev $h1 ingress protocol $proto pref 1 handle 101 flower } @@ -542,7 +542,7 @@ ipv4_lpm_miss_test() log_test "LPM miss: IPv4" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid vrf_without_routes_destroy } @@ -569,7 +569,7 @@ ipv6_lpm_miss_test() log_test "LPM miss: IPv6" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid vrf_without_routes_destroy } diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh index e9a82cae8c9a..4ac1dae92d0f 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh @@ -176,7 +176,7 @@ ecn_decap_test() log_test "$desc: Inner ECN is not ECT and outer is $ecn_desc" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower } @@ -207,7 +207,7 @@ no_matching_tunnel_test() log_test "$desc" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower } diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip6.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip6.sh index 878125041fc3..fce885184404 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip6.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip6.sh @@ -176,7 +176,7 @@ ecn_decap_test() log_test "$desc: Inner ECN is not ECT and outer is $ecn_desc" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid tc filter del dev $swp1 egress protocol ipv6 pref 1 handle 101 flower } @@ -207,7 +207,7 @@ no_matching_tunnel_test() log_test "$desc" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid tc filter del dev $swp1 egress protocol ipv6 pref 1 handle 101 flower } diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh index 5f6eb965cfd1..7aca8e5922cf 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh @@ -183,7 +183,7 @@ ecn_decap_test() log_test "$desc: Inner ECN is not ECT and outer is $ecn_desc" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower } @@ -253,7 +253,7 @@ corrupted_packet_test() log_test "$desc" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower } diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh index f6c16cbb6cf7..4599c331240b 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan_ipv6.sh @@ -188,7 +188,7 @@ ecn_decap_test() log_test "$desc: Inner ECN is not ECT and outer is $ecn_desc" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid tc filter del dev $swp1 egress protocol ipv6 pref 1 handle 101 flower } @@ -262,7 +262,7 @@ corrupted_packet_test() log_test "$desc" - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid tc filter del dev $swp1 egress protocol ipv6 pref 1 handle 101 flower } diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh index 83a0210e7544..bc7ea2df49fb 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh @@ -218,7 +218,7 @@ psample_capture_start() psample_capture_stop() { - { kill %% && wait %%; } 2>/dev/null + kill_process %% } __tc_sample_rate_test() @@ -499,7 +499,7 @@ tc_sample_md_out_tc_occ_test() backlog=$(tc -j -p -s qdisc show dev $rp2 | jq '.[0]["backlog"]') # Kill mausezahn. - { kill %% && wait %%; } 2>/dev/null + kill_process %% psample_capture_stop diff --git a/tools/testing/selftests/drivers/net/netdevsim/fib_notifications.sh b/tools/testing/selftests/drivers/net/netdevsim/fib_notifications.sh index 8d91191a098c..9896580c3d85 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/fib_notifications.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/fib_notifications.sh @@ -94,7 +94,7 @@ route_addition_check() sleep 1 $IP route add $route dev dummy1 sleep 1 - kill %% && wait %% &> /dev/null + kill_process %% route_notify_check $outfile $expected_num_notifications $offload_failed rm -f $outfile @@ -148,7 +148,7 @@ route_deletion_check() sleep 1 $IP route del $route dev dummy1 sleep 1 - kill %% && wait %% &> /dev/null + kill_process %% route_notify_check $outfile $expected_num_notifications rm -f $outfile @@ -191,7 +191,7 @@ route_replacement_check() sleep 1 $IP route replace $route dev dummy2 sleep 1 - kill %% && wait %% &> /dev/null + kill_process %% route_notify_check $outfile $expected_num_notifications rm -f $outfile diff --git a/tools/testing/selftests/net/drop_monitor_tests.sh b/tools/testing/selftests/net/drop_monitor_tests.sh index 7c4818c971fc..507d0a82f5f0 100755 --- a/tools/testing/selftests/net/drop_monitor_tests.sh +++ b/tools/testing/selftests/net/drop_monitor_tests.sh @@ -77,7 +77,7 @@ sw_drops_test() rm ${dir}/packets.pcap - { kill %% && wait %%; } 2>/dev/null + kill_process %% timeout 5 dwdump -o sw -w ${dir}/packets.pcap (( $(tshark -r ${dir}/packets.pcap \ -Y 'ip.dst == 192.0.2.10' 2> /dev/null | wc -l) == 0)) diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 5f3c28fc8624..3ea6f886a210 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -689,7 +689,7 @@ fib6_notify_test() log_test $ret 0 "ipv6 route add notify" - { kill %% && wait %%; } 2>/dev/null + kill_process %% #rm errors.txt @@ -736,7 +736,7 @@ fib_notify_test() log_test $ret 0 "ipv4 route add notify" - { kill %% && wait %%; } 2>/dev/null + kill_process %% rm errors.txt @@ -2328,7 +2328,7 @@ ipv4_mangle_test() $IP route del table 123 172.16.101.0/24 dev veth1 $IP rule del pref 100 - { kill %% && wait %%; } 2>/dev/null + kill_process %% rm $tmp_file route_cleanup @@ -2386,7 +2386,7 @@ ipv6_mangle_test() $IP -6 route del table 123 2001:db8:101::/64 dev veth1 $IP -6 rule del pref 100 - { kill %% && wait %%; } 2>/dev/null + kill_process %% rm $tmp_file route_cleanup diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh index 62a05bca1e82..18afa89ebbcc 100644 --- a/tools/testing/selftests/net/forwarding/devlink_lib.sh +++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh @@ -501,7 +501,7 @@ devlink_trap_drop_cleanup() local pref=$1; shift local handle=$1; shift - kill $mz_pid && wait $mz_pid &> /dev/null + kill_process $mz_pid tc filter del dev $dev egress protocol $proto pref $pref handle $handle flower } diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 8625e3c99f55..7337f398f9cc 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -1574,8 +1574,7 @@ stop_traffic() { local pid=${1-%%}; shift - # Suppress noise from killing mausezahn. - { kill $pid && wait $pid; } 2>/dev/null + kill_process "$pid" } declare -A cappid diff --git a/tools/testing/selftests/net/forwarding/tc_police.sh b/tools/testing/selftests/net/forwarding/tc_police.sh index 5103f64a71d6..509fdedfcfa1 100755 --- a/tools/testing/selftests/net/forwarding/tc_police.sh +++ b/tools/testing/selftests/net/forwarding/tc_police.sh @@ -148,7 +148,7 @@ police_common_test() log_test "$test_name" - { kill %% && wait %%; } 2>/dev/null + kill_process %% tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower } @@ -198,7 +198,7 @@ police_shared_common_test() log_test "$test_name" - { kill %% && wait %%; } 2>/dev/null + kill_process %% } police_shared_test() @@ -278,7 +278,7 @@ police_mirror_common_test() log_test "$test_name" - { kill %% && wait %%; } 2>/dev/null + kill_process %% tc filter del dev $pol_if $dir protocol ip pref 1 handle 101 flower tc filter del dev $h3 ingress protocol ip pref 1 handle 101 flower tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower @@ -320,7 +320,7 @@ police_pps_common_test() log_test "$test_name" - { kill %% && wait %%; } 2>/dev/null + kill_process %% tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower } diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 6bcf5d13879d..24f63e45735d 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -434,3 +434,11 @@ xfail_on_veth() "$@" fi } + +kill_process() +{ + local pid=$1; shift + + # Suppress noise from killing the process. + { kill $pid && wait $pid; } 2>/dev/null +} -- 2.51.0 From 15880bec9bc32ddc8f70f8c551745c2344233372 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 14 Nov 2024 15:09:59 +0100 Subject: [PATCH 05/16] selftests: net: fdb_notify: Add a test for FDB notifications Check that only one notification is produced for various FDB edit operations. Regarding the ip_link_add() and ip_link_master() helpers. This pattern of action plus corresponding defer is bound to come up often, and a dedicated vocabulary to capture it will be handy. tunnel_create() and vlan_create() from forwarding/lib.sh are somewhat opaque and perhaps too kitchen-sinky, so I tried to go in the opposite direction with these ones, and wrapped only the bare minimum to schedule a corresponding cleanup. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Acked-by: Shuah Khan Link: https://patch.msgid.link/910c5880ae6d3b558d6889cbdba2be690c2615c6.1731589511.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/fdb_notify.sh | 96 +++++++++++++++++++++++ tools/testing/selftests/net/lib.sh | 17 ++++ 3 files changed, 114 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/net/fdb_notify.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index d323898c466c..3d487b03c4a0 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -93,7 +93,7 @@ TEST_PROGS += test_vxlan_mdb.sh TEST_PROGS += test_bridge_neigh_suppress.sh TEST_PROGS += test_vxlan_nolocalbypass.sh TEST_PROGS += test_bridge_backup_port.sh -TEST_PROGS += fdb_flush.sh +TEST_PROGS += fdb_flush.sh fdb_notify.sh TEST_PROGS += fq_band_pktlimit.sh TEST_PROGS += vlan_hw_filter.sh TEST_PROGS += bpf_offload.py diff --git a/tools/testing/selftests/net/fdb_notify.sh b/tools/testing/selftests/net/fdb_notify.sh new file mode 100755 index 000000000000..c03151e7791c --- /dev/null +++ b/tools/testing/selftests/net/fdb_notify.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +ALL_TESTS=" + test_dup_bridge + test_dup_vxlan_self + test_dup_vxlan_master + test_dup_macvlan_self + test_dup_macvlan_master +" + +do_test_dup() +{ + local op=$1; shift + local what=$1; shift + local tmpf + + RET=0 + + tmpf=$(mktemp) + defer rm "$tmpf" + + defer_scope_push + bridge monitor fdb &> "$tmpf" & + defer kill_process $! + + sleep 0.5 + bridge fdb "$op" 00:11:22:33:44:55 vlan 1 "$@" + sleep 0.5 + defer_scope_pop + + local count=$(grep -c -e 00:11:22:33:44:55 $tmpf) + ((count == 1)) + check_err $? "Got $count notifications, expected 1" + + log_test "$what $op: Duplicate notifications" +} + +test_dup_bridge() +{ + ip_link_add br up type bridge vlan_filtering 1 + do_test_dup add "bridge" dev br self + do_test_dup del "bridge" dev br self +} + +test_dup_vxlan_self() +{ + ip_link_add br up type bridge vlan_filtering 1 + ip_link_add vx up type vxlan id 2000 dstport 4789 + ip_link_master vx br + + do_test_dup add "vxlan" dev vx self dst 192.0.2.1 + do_test_dup del "vxlan" dev vx self dst 192.0.2.1 +} + +test_dup_vxlan_master() +{ + ip_link_add br up type bridge vlan_filtering 1 + ip_link_add vx up type vxlan id 2000 dstport 4789 + ip_link_master vx br + + do_test_dup add "vxlan master" dev vx master + do_test_dup del "vxlan master" dev vx master +} + +test_dup_macvlan_self() +{ + ip_link_add dd up type dummy + ip_link_add mv up link dd type macvlan mode passthru + + do_test_dup add "macvlan self" dev mv self + do_test_dup del "macvlan self" dev mv self +} + +test_dup_macvlan_master() +{ + ip_link_add br up type bridge vlan_filtering 1 + ip_link_add dd up type dummy + ip_link_add mv up link dd type macvlan mode passthru + ip_link_master mv br + + do_test_dup add "macvlan master" dev mv self + do_test_dup del "macvlan master" dev mv self +} + +cleanup() +{ + defer_scopes_cleanup +} + +trap cleanup EXIT +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 24f63e45735d..8994fec1c38f 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -442,3 +442,20 @@ kill_process() # Suppress noise from killing the process. { kill $pid && wait $pid; } 2>/dev/null } + +ip_link_add() +{ + local name=$1; shift + + ip link add name "$name" "$@" + defer ip link del dev "$name" +} + +ip_link_master() +{ + local member=$1; shift + local master=$1; shift + + ip link set dev "$member" master "$master" + defer ip link set dev "$member" nomaster +} -- 2.51.0 From 9f19c084057abfb8e4676e6e91866bfa5a6a5577 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 12 Nov 2024 09:29:16 +0800 Subject: [PATCH 06/16] virtio_ring: introduce vring_need_unmap_buffer To make the code readable, introduce vring_need_unmap_buffer() to replace do_unmap. use_dma_api premapped -> vring_need_unmap_buffer() 1. false false false 2. true false true 3. true true false Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Link: https://patch.msgid.link/20241112012928.102478-2-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/virtio/virtio_ring.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 98374ed7c577..97590c201aa2 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -175,11 +175,6 @@ struct vring_virtqueue { /* Do DMA mapping by driver */ bool premapped; - /* Do unmap or not for desc. Just when premapped is False and - * use_dma_api is true, this is true. - */ - bool do_unmap; - /* Head of free buffer list. */ unsigned int free_head; /* Number we've added since last sync. */ @@ -297,6 +292,11 @@ static bool vring_use_dma_api(const struct virtio_device *vdev) return false; } +static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring) +{ + return vring->use_dma_api && !vring->premapped; +} + size_t virtio_max_dma_size(const struct virtio_device *vdev) { size_t max_segment_size = SIZE_MAX; @@ -445,7 +445,7 @@ static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq, { u16 flags; - if (!vq->do_unmap) + if (!vring_need_unmap_buffer(vq)) return; flags = virtio16_to_cpu(vq->vq.vdev, desc->flags); @@ -475,7 +475,7 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } else { - if (!vq->do_unmap) + if (!vring_need_unmap_buffer(vq)) goto out; dma_unmap_page(vring_dma_dev(vq), @@ -643,7 +643,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, } /* Last one doesn't continue. */ desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); - if (!indirect && vq->do_unmap) + if (!indirect && vring_need_unmap_buffer(vq)) vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &= ~VRING_DESC_F_NEXT; @@ -802,7 +802,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, VRING_DESC_F_INDIRECT)); BUG_ON(len == 0 || len % sizeof(struct vring_desc)); - if (vq->do_unmap) { + if (vring_need_unmap_buffer(vq)) { for (j = 0; j < len / sizeof(struct vring_desc); j++) vring_unmap_one_split_indirect(vq, &indir_desc[j]); } @@ -1236,7 +1236,7 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq, (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } else { - if (!vq->do_unmap) + if (!vring_need_unmap_buffer(vq)) return; dma_unmap_page(vring_dma_dev(vq), @@ -1251,7 +1251,7 @@ static void vring_unmap_desc_packed(const struct vring_virtqueue *vq, { u16 flags; - if (!vq->do_unmap) + if (!vring_need_unmap_buffer(vq)) return; flags = le16_to_cpu(desc->flags); @@ -1632,7 +1632,7 @@ static void detach_buf_packed(struct vring_virtqueue *vq, if (!desc) return; - if (vq->do_unmap) { + if (vring_need_unmap_buffer(vq)) { len = vq->packed.desc_extra[id].len; for (i = 0; i < len / sizeof(struct vring_packed_desc); i++) @@ -2091,7 +2091,6 @@ static struct virtqueue *vring_create_virtqueue_packed( vq->dma_dev = dma_dev; vq->use_dma_api = vring_use_dma_api(vdev); vq->premapped = false; - vq->do_unmap = vq->use_dma_api; vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; @@ -2636,7 +2635,6 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index, vq->dma_dev = dma_dev; vq->use_dma_api = vring_use_dma_api(vdev); vq->premapped = false; - vq->do_unmap = vq->use_dma_api; vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; @@ -2799,7 +2797,6 @@ int virtqueue_set_dma_premapped(struct virtqueue *_vq) } vq->premapped = true; - vq->do_unmap = false; END_USE(vq); -- 2.51.0 From bc2b4c3401c6acaba6c35837c415874cff91b124 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 12 Nov 2024 09:29:17 +0800 Subject: [PATCH 07/16] virtio_ring: split: record extras for indirect buffers The subsequent commit needs to know whether every indirect buffer is premapped or not. So we need to introduce an extra struct for every indirect buffer to record this info. Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Link: https://patch.msgid.link/20241112012928.102478-3-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/virtio/virtio_ring.c | 112 ++++++++++++++++------------------- 1 file changed, 52 insertions(+), 60 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 97590c201aa2..405d5a348795 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -69,7 +69,11 @@ struct vring_desc_state_split { void *data; /* Data for callback. */ - struct vring_desc *indir_desc; /* Indirect descriptor, if any. */ + + /* Indirect desc table and extra table, if any. These two will be + * allocated together. So we won't stress more to the memory allocator. + */ + struct vring_desc *indir_desc; }; struct vring_desc_state_packed { @@ -440,38 +444,20 @@ static void virtqueue_init(struct vring_virtqueue *vq, u32 num) * Split ring specific functions - *_split(). */ -static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq, - const struct vring_desc *desc) -{ - u16 flags; - - if (!vring_need_unmap_buffer(vq)) - return; - - flags = virtio16_to_cpu(vq->vq.vdev, desc->flags); - - dma_unmap_page(vring_dma_dev(vq), - virtio64_to_cpu(vq->vq.vdev, desc->addr), - virtio32_to_cpu(vq->vq.vdev, desc->len), - (flags & VRING_DESC_F_WRITE) ? - DMA_FROM_DEVICE : DMA_TO_DEVICE); -} - static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, - unsigned int i) + struct vring_desc_extra *extra) { - struct vring_desc_extra *extra = vq->split.desc_extra; u16 flags; - flags = extra[i].flags; + flags = extra->flags; if (flags & VRING_DESC_F_INDIRECT) { if (!vq->use_dma_api) goto out; dma_unmap_single(vring_dma_dev(vq), - extra[i].addr, - extra[i].len, + extra->addr, + extra->len, (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } else { @@ -479,22 +465,23 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, goto out; dma_unmap_page(vring_dma_dev(vq), - extra[i].addr, - extra[i].len, + extra->addr, + extra->len, (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } out: - return extra[i].next; + return extra->next; } static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, unsigned int total_sg, gfp_t gfp) { + struct vring_desc_extra *extra; struct vring_desc *desc; - unsigned int i; + unsigned int i, size; /* * We require lowmem mappings for the descriptors because @@ -503,40 +490,41 @@ static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, */ gfp &= ~__GFP_HIGHMEM; - desc = kmalloc_array(total_sg, sizeof(struct vring_desc), gfp); + size = sizeof(*desc) * total_sg + sizeof(*extra) * total_sg; + + desc = kmalloc(size, gfp); if (!desc) return NULL; + extra = (struct vring_desc_extra *)&desc[total_sg]; + for (i = 0; i < total_sg; i++) - desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1); + extra[i].next = i + 1; + return desc; } static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, struct vring_desc *desc, + struct vring_desc_extra *extra, unsigned int i, dma_addr_t addr, unsigned int len, - u16 flags, - bool indirect) + u16 flags) { - struct vring_virtqueue *vring = to_vvq(vq); - struct vring_desc_extra *extra = vring->split.desc_extra; u16 next; desc[i].flags = cpu_to_virtio16(vq->vdev, flags); desc[i].addr = cpu_to_virtio64(vq->vdev, addr); desc[i].len = cpu_to_virtio32(vq->vdev, len); - if (!indirect) { - next = extra[i].next; - desc[i].next = cpu_to_virtio16(vq->vdev, next); + extra[i].addr = addr; + extra[i].len = len; + extra[i].flags = flags; + + next = extra[i].next; - extra[i].addr = addr; - extra[i].len = len; - extra[i].flags = flags; - } else - next = virtio16_to_cpu(vq->vdev, desc[i].next); + desc[i].next = cpu_to_virtio16(vq->vdev, next); return next; } @@ -551,6 +539,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); + struct vring_desc_extra *extra; struct scatterlist *sg; struct vring_desc *desc; unsigned int i, n, avail, descs_used, prev, err_idx; @@ -586,9 +575,11 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, /* Set up rest to use this indirect table. */ i = 0; descs_used = 1; + extra = (struct vring_desc_extra *)&desc[total_sg]; } else { indirect = false; desc = vq->split.vring.desc; + extra = vq->split.desc_extra; i = head; descs_used = total_sg; } @@ -618,9 +609,8 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, /* Note that we trust indirect descriptor * table since it use stream DMA mapping. */ - i = virtqueue_add_desc_split(_vq, desc, i, addr, sg->length, - VRING_DESC_F_NEXT, - indirect); + i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, sg->length, + VRING_DESC_F_NEXT); } } for (; n < (out_sgs + in_sgs); n++) { @@ -634,11 +624,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, /* Note that we trust indirect descriptor * table since it use stream DMA mapping. */ - i = virtqueue_add_desc_split(_vq, desc, i, addr, + i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, sg->length, VRING_DESC_F_NEXT | - VRING_DESC_F_WRITE, - indirect); + VRING_DESC_F_WRITE); } } /* Last one doesn't continue. */ @@ -660,10 +649,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, } virtqueue_add_desc_split(_vq, vq->split.vring.desc, + vq->split.desc_extra, head, addr, total_sg * sizeof(struct vring_desc), - VRING_DESC_F_INDIRECT, - false); + VRING_DESC_F_INDIRECT); } /* We're using some buffers from the free list. */ @@ -716,11 +705,8 @@ unmap_release: for (n = 0; n < total_sg; n++) { if (i == err_idx) break; - if (indirect) { - vring_unmap_one_split_indirect(vq, &desc[i]); - i = virtio16_to_cpu(_vq->vdev, desc[i].next); - } else - i = vring_unmap_one_split(vq, i); + + i = vring_unmap_one_split(vq, &extra[i]); } free_indirect: @@ -765,22 +751,25 @@ static bool virtqueue_kick_prepare_split(struct virtqueue *_vq) static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, void **ctx) { + struct vring_desc_extra *extra; unsigned int i, j; __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); /* Clear data ptr. */ vq->split.desc_state[head].data = NULL; + extra = vq->split.desc_extra; + /* Put back on free list: unmap first-level descriptors and find end */ i = head; while (vq->split.vring.desc[i].flags & nextflag) { - vring_unmap_one_split(vq, i); + vring_unmap_one_split(vq, &extra[i]); i = vq->split.desc_extra[i].next; vq->vq.num_free++; } - vring_unmap_one_split(vq, i); + vring_unmap_one_split(vq, &extra[i]); vq->split.desc_extra[i].next = vq->free_head; vq->free_head = head; @@ -790,21 +779,24 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, if (vq->indirect) { struct vring_desc *indir_desc = vq->split.desc_state[head].indir_desc; - u32 len; + u32 len, num; /* Free the indirect table, if any, now that it's unmapped. */ if (!indir_desc) return; - len = vq->split.desc_extra[head].len; BUG_ON(!(vq->split.desc_extra[head].flags & VRING_DESC_F_INDIRECT)); BUG_ON(len == 0 || len % sizeof(struct vring_desc)); + num = len / sizeof(struct vring_desc); + + extra = (struct vring_desc_extra *)&indir_desc[num]; + if (vring_need_unmap_buffer(vq)) { - for (j = 0; j < len / sizeof(struct vring_desc); j++) - vring_unmap_one_split_indirect(vq, &indir_desc[j]); + for (j = 0; j < num; j++) + vring_unmap_one_split(vq, &extra[j]); } kfree(indir_desc); -- 2.51.0 From aaa789843a93faa7e2bb309b7647f1cc0a083158 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 12 Nov 2024 09:29:18 +0800 Subject: [PATCH 08/16] virtio_ring: packed: record extras for indirect buffers The subsequent commit needs to know whether every indirect buffer is premapped or not. So we need to introduce an extra struct for every indirect buffer to record this info. Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Link: https://patch.msgid.link/20241112012928.102478-4-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/virtio/virtio_ring.c | 60 +++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 405d5a348795..cfe70c40f630 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -78,7 +78,11 @@ struct vring_desc_state_split { struct vring_desc_state_packed { void *data; /* Data for callback. */ - struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */ + + /* Indirect desc table and extra table, if any. These two will be + * allocated together. So we won't stress more to the memory allocator. + */ + struct vring_packed_desc *indir_desc; u16 num; /* Descriptor list length. */ u16 last; /* The last desc state in a list. */ }; @@ -1238,27 +1242,12 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq, } } -static void vring_unmap_desc_packed(const struct vring_virtqueue *vq, - const struct vring_packed_desc *desc) -{ - u16 flags; - - if (!vring_need_unmap_buffer(vq)) - return; - - flags = le16_to_cpu(desc->flags); - - dma_unmap_page(vring_dma_dev(vq), - le64_to_cpu(desc->addr), - le32_to_cpu(desc->len), - (flags & VRING_DESC_F_WRITE) ? - DMA_FROM_DEVICE : DMA_TO_DEVICE); -} - static struct vring_packed_desc *alloc_indirect_packed(unsigned int total_sg, gfp_t gfp) { + struct vring_desc_extra *extra; struct vring_packed_desc *desc; + int i, size; /* * We require lowmem mappings for the descriptors because @@ -1267,7 +1256,16 @@ static struct vring_packed_desc *alloc_indirect_packed(unsigned int total_sg, */ gfp &= ~__GFP_HIGHMEM; - desc = kmalloc_array(total_sg, sizeof(struct vring_packed_desc), gfp); + size = (sizeof(*desc) + sizeof(*extra)) * total_sg; + + desc = kmalloc(size, gfp); + if (!desc) + return NULL; + + extra = (struct vring_desc_extra *)&desc[total_sg]; + + for (i = 0; i < total_sg; i++) + extra[i].next = i + 1; return desc; } @@ -1280,6 +1278,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, void *data, gfp_t gfp) { + struct vring_desc_extra *extra; struct vring_packed_desc *desc; struct scatterlist *sg; unsigned int i, n, err_idx; @@ -1291,6 +1290,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, if (!desc) return -ENOMEM; + extra = (struct vring_desc_extra *)&desc[total_sg]; + if (unlikely(vq->vq.num_free < 1)) { pr_debug("Can't add buf len 1 - avail = 0\n"); kfree(desc); @@ -1312,6 +1313,13 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, 0 : VRING_DESC_F_WRITE); desc[i].addr = cpu_to_le64(addr); desc[i].len = cpu_to_le32(sg->length); + + if (unlikely(vq->use_dma_api)) { + extra[i].addr = addr; + extra[i].len = sg->length; + extra[i].flags = n < out_sgs ? 0 : VRING_DESC_F_WRITE; + } + i++; } } @@ -1381,7 +1389,7 @@ unmap_release: err_idx = i; for (i = 0; i < err_idx; i++) - vring_unmap_desc_packed(vq, &desc[i]); + vring_unmap_extra_packed(vq, &extra[i]); free_desc: kfree(desc); @@ -1617,7 +1625,8 @@ static void detach_buf_packed(struct vring_virtqueue *vq, } if (vq->indirect) { - u32 len; + struct vring_desc_extra *extra; + u32 len, num; /* Free the indirect table, if any, now that it's unmapped. */ desc = state->indir_desc; @@ -1626,9 +1635,12 @@ static void detach_buf_packed(struct vring_virtqueue *vq, if (vring_need_unmap_buffer(vq)) { len = vq->packed.desc_extra[id].len; - for (i = 0; i < len / sizeof(struct vring_packed_desc); - i++) - vring_unmap_desc_packed(vq, &desc[i]); + num = len / sizeof(struct vring_packed_desc); + + extra = (struct vring_desc_extra *)&desc[num]; + + for (i = 0; i < num; i++) + vring_unmap_extra_packed(vq, &extra[i]); } kfree(desc); state->indir_desc = NULL; -- 2.51.0 From c7e1b422afac5385832297af8ad86e63742c6e40 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 12 Nov 2024 09:29:19 +0800 Subject: [PATCH 09/16] virtio_ring: perform premapped operations based on per-buffer The current configuration sets the virtqueue (vq) to premapped mode, implying that all buffers submitted to this queue must be mapped ahead of time. This presents a challenge for the virtnet send queue (sq): the virtnet driver would be required to keep track of dma information for vq size * 17, which can be substantial. However, if the premapped mode were applied on a per-buffer basis, the complexity would be greatly reduced. With AF_XDP enabled, AF_XDP buffers would become premapped, while kernel skb buffers could remain unmapped. And consider that some sgs are not generated by the virtio driver, that may be passed from the block stack. So we can not change the sgs, new APIs are the better way. So we pass the new argument 'premapped' to indicate the buffers submitted to virtio are premapped in advance. Additionally, DMA unmap operations for these buffers will be bypassed. Suggested-by: Jason Wang Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Link: https://patch.msgid.link/20241112012928.102478-5-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/virtio/virtio_ring.c | 101 ++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 48 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index cfe70c40f630..fefa85a5e6b6 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -300,9 +300,10 @@ static bool vring_use_dma_api(const struct virtio_device *vdev) return false; } -static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring) +static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring, + const struct vring_desc_extra *extra) { - return vring->use_dma_api && !vring->premapped; + return vring->use_dma_api && (extra->addr != DMA_MAPPING_ERROR); } size_t virtio_max_dma_size(const struct virtio_device *vdev) @@ -372,13 +373,17 @@ static struct device *vring_dma_dev(const struct vring_virtqueue *vq) /* Map one sg entry. */ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg, - enum dma_data_direction direction, dma_addr_t *addr) + enum dma_data_direction direction, dma_addr_t *addr, + u32 *len, bool premapped) { - if (vq->premapped) { + if (premapped) { *addr = sg_dma_address(sg); + *len = sg_dma_len(sg); return 0; } + *len = sg->length; + if (!vq->use_dma_api) { /* * If DMA is not used, KMSAN doesn't know that the scatterlist @@ -465,7 +470,7 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } else { - if (!vring_need_unmap_buffer(vq)) + if (!vring_need_unmap_buffer(vq, extra)) goto out; dma_unmap_page(vring_dma_dev(vq), @@ -514,7 +519,7 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, unsigned int i, dma_addr_t addr, unsigned int len, - u16 flags) + u16 flags, bool premapped) { u16 next; @@ -522,7 +527,7 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, desc[i].addr = cpu_to_virtio64(vq->vdev, addr); desc[i].len = cpu_to_virtio32(vq->vdev, len); - extra[i].addr = addr; + extra[i].addr = premapped ? DMA_MAPPING_ERROR : addr; extra[i].len = len; extra[i].flags = flags; @@ -540,6 +545,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, unsigned int in_sgs, void *data, void *ctx, + bool premapped, gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -605,38 +611,41 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, for (n = 0; n < out_sgs; n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { dma_addr_t addr; + u32 len; - if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr)) + if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr, &len, premapped)) goto unmap_release; prev = i; /* Note that we trust indirect descriptor * table since it use stream DMA mapping. */ - i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, sg->length, - VRING_DESC_F_NEXT); + i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, len, + VRING_DESC_F_NEXT, + premapped); } } for (; n < (out_sgs + in_sgs); n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { dma_addr_t addr; + u32 len; - if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr)) + if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr, &len, premapped)) goto unmap_release; prev = i; /* Note that we trust indirect descriptor * table since it use stream DMA mapping. */ - i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, - sg->length, + i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, len, VRING_DESC_F_NEXT | - VRING_DESC_F_WRITE); + VRING_DESC_F_WRITE, + premapped); } } /* Last one doesn't continue. */ desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); - if (!indirect && vring_need_unmap_buffer(vq)) + if (!indirect && vring_need_unmap_buffer(vq, &extra[prev])) vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &= ~VRING_DESC_F_NEXT; @@ -645,18 +654,14 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, dma_addr_t addr = vring_map_single( vq, desc, total_sg * sizeof(struct vring_desc), DMA_TO_DEVICE); - if (vring_mapping_error(vq, addr)) { - if (vq->premapped) - goto free_indirect; - + if (vring_mapping_error(vq, addr)) goto unmap_release; - } virtqueue_add_desc_split(_vq, vq->split.vring.desc, vq->split.desc_extra, head, addr, total_sg * sizeof(struct vring_desc), - VRING_DESC_F_INDIRECT); + VRING_DESC_F_INDIRECT, false); } /* We're using some buffers from the free list. */ @@ -713,7 +718,6 @@ unmap_release: i = vring_unmap_one_split(vq, &extra[i]); } -free_indirect: if (indirect) kfree(desc); @@ -798,7 +802,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, extra = (struct vring_desc_extra *)&indir_desc[num]; - if (vring_need_unmap_buffer(vq)) { + if (vq->use_dma_api) { for (j = 0; j < num; j++) vring_unmap_one_split(vq, &extra[j]); } @@ -1232,7 +1236,7 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq, (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } else { - if (!vring_need_unmap_buffer(vq)) + if (!vring_need_unmap_buffer(vq, extra)) return; dma_unmap_page(vring_dma_dev(vq), @@ -1276,12 +1280,13 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, unsigned int out_sgs, unsigned int in_sgs, void *data, + bool premapped, gfp_t gfp) { struct vring_desc_extra *extra; struct vring_packed_desc *desc; struct scatterlist *sg; - unsigned int i, n, err_idx; + unsigned int i, n, err_idx, len; u16 head, id; dma_addr_t addr; @@ -1306,17 +1311,18 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, for (n = 0; n < out_sgs + in_sgs; n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { if (vring_map_one_sg(vq, sg, n < out_sgs ? - DMA_TO_DEVICE : DMA_FROM_DEVICE, &addr)) + DMA_TO_DEVICE : DMA_FROM_DEVICE, + &addr, &len, premapped)) goto unmap_release; desc[i].flags = cpu_to_le16(n < out_sgs ? 0 : VRING_DESC_F_WRITE); desc[i].addr = cpu_to_le64(addr); - desc[i].len = cpu_to_le32(sg->length); + desc[i].len = cpu_to_le32(len); if (unlikely(vq->use_dma_api)) { - extra[i].addr = addr; - extra[i].len = sg->length; + extra[i].addr = premapped ? DMA_MAPPING_ERROR : addr; + extra[i].len = len; extra[i].flags = n < out_sgs ? 0 : VRING_DESC_F_WRITE; } @@ -1328,12 +1334,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, addr = vring_map_single(vq, desc, total_sg * sizeof(struct vring_packed_desc), DMA_TO_DEVICE); - if (vring_mapping_error(vq, addr)) { - if (vq->premapped) - goto free_desc; - + if (vring_mapping_error(vq, addr)) goto unmap_release; - } vq->packed.vring.desc[head].addr = cpu_to_le64(addr); vq->packed.vring.desc[head].len = cpu_to_le32(total_sg * @@ -1391,7 +1393,6 @@ unmap_release: for (i = 0; i < err_idx; i++) vring_unmap_extra_packed(vq, &extra[i]); -free_desc: kfree(desc); END_USE(vq); @@ -1405,12 +1406,13 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, unsigned int in_sgs, void *data, void *ctx, + bool premapped, gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); struct vring_packed_desc *desc; struct scatterlist *sg; - unsigned int i, n, c, descs_used, err_idx; + unsigned int i, n, c, descs_used, err_idx, len; __le16 head_flags, flags; u16 head, id, prev, curr, avail_used_flags; int err; @@ -1431,7 +1433,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, if (virtqueue_use_indirect(vq, total_sg)) { err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs, - in_sgs, data, gfp); + in_sgs, data, premapped, gfp); if (err != -ENOMEM) { END_USE(vq); return err; @@ -1466,7 +1468,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, dma_addr_t addr; if (vring_map_one_sg(vq, sg, n < out_sgs ? - DMA_TO_DEVICE : DMA_FROM_DEVICE, &addr)) + DMA_TO_DEVICE : DMA_FROM_DEVICE, + &addr, &len, premapped)) goto unmap_release; flags = cpu_to_le16(vq->packed.avail_used_flags | @@ -1478,12 +1481,13 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, desc[i].flags = flags; desc[i].addr = cpu_to_le64(addr); - desc[i].len = cpu_to_le32(sg->length); + desc[i].len = cpu_to_le32(len); desc[i].id = cpu_to_le16(id); if (unlikely(vq->use_dma_api)) { - vq->packed.desc_extra[curr].addr = addr; - vq->packed.desc_extra[curr].len = sg->length; + vq->packed.desc_extra[curr].addr = premapped ? + DMA_MAPPING_ERROR : addr; + vq->packed.desc_extra[curr].len = len; vq->packed.desc_extra[curr].flags = le16_to_cpu(flags); } @@ -1633,7 +1637,7 @@ static void detach_buf_packed(struct vring_virtqueue *vq, if (!desc) return; - if (vring_need_unmap_buffer(vq)) { + if (vq->use_dma_api) { len = vq->packed.desc_extra[id].len; num = len / sizeof(struct vring_packed_desc); @@ -2204,14 +2208,15 @@ static inline int virtqueue_add(struct virtqueue *_vq, unsigned int in_sgs, void *data, void *ctx, + bool premapped, gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); return vq->packed_ring ? virtqueue_add_packed(_vq, sgs, total_sg, - out_sgs, in_sgs, data, ctx, gfp) : + out_sgs, in_sgs, data, ctx, premapped, gfp) : virtqueue_add_split(_vq, sgs, total_sg, - out_sgs, in_sgs, data, ctx, gfp); + out_sgs, in_sgs, data, ctx, premapped, gfp); } /** @@ -2245,7 +2250,7 @@ int virtqueue_add_sgs(struct virtqueue *_vq, total_sg++; } return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, - data, NULL, gfp); + data, NULL, false, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_sgs); @@ -2267,7 +2272,7 @@ int virtqueue_add_outbuf(struct virtqueue *vq, void *data, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, gfp); + return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, false, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); @@ -2289,7 +2294,7 @@ int virtqueue_add_inbuf(struct virtqueue *vq, void *data, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, gfp); + return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, false, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); @@ -2313,7 +2318,7 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq, void *ctx, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, gfp); + return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, false, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx); -- 2.51.0 From 3ef66af31feaf5ff5dd73e63b1327789822ed476 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 12 Nov 2024 09:29:20 +0800 Subject: [PATCH 10/16] virtio_ring: introduce add api for premapped Two APIs are introduced to submit premapped per-buffers. int virtqueue_add_inbuf_premapped(struct virtqueue *vq, struct scatterlist *sg, unsigned int num, void *data, void *ctx, gfp_t gfp); int virtqueue_add_outbuf_premapped(struct virtqueue *vq, struct scatterlist *sg, unsigned int num, void *data, gfp_t gfp); Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Link: https://patch.msgid.link/20241112012928.102478-6-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/virtio/virtio_ring.c | 48 ++++++++++++++++++++++++++++++++++++ include/linux/virtio.h | 11 +++++++++ 2 files changed, 59 insertions(+) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index fefa85a5e6b6..0842d27886e5 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2276,6 +2276,29 @@ int virtqueue_add_outbuf(struct virtqueue *vq, } EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); +/** + * virtqueue_add_outbuf_premapped - expose output buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sg: scatterlist (must be well-formed and terminated!) + * @num: the number of entries in @sg readable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Return: + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_outbuf_premapped(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, true, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_outbuf_premapped); + /** * virtqueue_add_inbuf - expose input buffers to other end * @vq: the struct virtqueue we're talking about. @@ -2322,6 +2345,31 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq, } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx); +/** + * virtqueue_add_inbuf_premapped - expose input buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sg: scatterlist (must be well-formed and terminated!) + * @num: the number of entries in @sg writable by other side + * @data: the token identifying the buffer. + * @ctx: extra context for the token + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Return: + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_inbuf_premapped(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + void *ctx, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, true, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_premapped); + /** * virtqueue_dma_dev - get the dma dev * @_vq: the struct virtqueue we're talking about. diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 306137a15d07..13b3f55abca3 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -56,6 +56,17 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq, void *ctx, gfp_t gfp); +int virtqueue_add_inbuf_premapped(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + void *ctx, + gfp_t gfp); + +int virtqueue_add_outbuf_premapped(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + gfp_t gfp); + int virtqueue_add_sgs(struct virtqueue *vq, struct scatterlist *sgs[], unsigned int out_sgs, -- 2.51.0 From 31f3cd4e5756b5ae2bbc05e1ea28d2242dd2c03f Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 12 Nov 2024 09:29:21 +0800 Subject: [PATCH 11/16] virtio-net: rq submits premapped per-buffer virtio-net rq submits premapped per-buffer by setting sg page to NULL; Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Link: https://patch.msgid.link/20241112012928.102478-7-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index ec06da34cd10..8aca4a3fc7e8 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -559,6 +559,12 @@ static struct sk_buff *ptr_to_skb(void *ptr) return (struct sk_buff *)((unsigned long)ptr & ~VIRTIO_ORPHAN_FLAG); } +static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len) +{ + sg_dma_address(sg) = addr; + sg_dma_len(sg) = len; +} + static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, bool in_napi, struct virtnet_sq_free_stats *stats) { @@ -936,8 +942,7 @@ static void virtnet_rq_init_one_sg(struct receive_queue *rq, void *buf, u32 len) addr = dma->addr - sizeof(*dma) + offset; sg_init_table(rq->sg, 1); - rq->sg[0].dma_address = addr; - rq->sg[0].length = len; + sg_fill_dma(rq->sg, addr, len); } static void *virtnet_rq_alloc(struct receive_queue *rq, u32 size, gfp_t gfp) @@ -1087,12 +1092,6 @@ static void check_sq_full_and_disable(struct virtnet_info *vi, } } -static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len) -{ - sg->dma_address = addr; - sg->length = len; -} - static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi, struct receive_queue *rq, void *buf, u32 len) { @@ -1373,7 +1372,8 @@ static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue sg_init_table(rq->sg, 1); sg_fill_dma(rq->sg, addr, len); - err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, xsk_buffs[i], gfp); + err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1, + xsk_buffs[i], NULL, gfp); if (err) goto err; } @@ -2453,7 +2453,7 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq, virtnet_rq_init_one_sg(rq, buf, vi->hdr_len + GOOD_PACKET_LEN); - err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); + err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) { virtnet_rq_unmap(rq, buf, 0); put_page(virt_to_head_page(buf)); @@ -2573,7 +2573,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi, virtnet_rq_init_one_sg(rq, buf, len); ctx = mergeable_len_to_ctx(len + room, headroom); - err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); + err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1, buf, ctx, gfp); if (err < 0) { virtnet_rq_unmap(rq, buf, 0); put_page(virt_to_head_page(buf)); -- 2.51.0 From 880ebcbe06636c42cfb328039581e177c6cd6ba5 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 12 Nov 2024 09:29:22 +0800 Subject: [PATCH 12/16] virtio_ring: remove API virtqueue_set_dma_premapped Now, this API is useless. remove it. Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Link: https://patch.msgid.link/20241112012928.102478-8-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 13 ---------- drivers/virtio/virtio_ring.c | 48 ------------------------------------ include/linux/virtio.h | 2 -- 3 files changed, 63 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 8aca4a3fc7e8..183ad5e6bef0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -6168,15 +6168,6 @@ err_ctrl: return -ENOMEM; } -static void virtnet_rq_set_premapped(struct virtnet_info *vi) -{ - int i; - - for (i = 0; i < vi->max_queue_pairs; i++) - /* error should never happen */ - BUG_ON(virtqueue_set_dma_premapped(vi->rq[i].vq)); -} - static int init_vqs(struct virtnet_info *vi) { int ret; @@ -6190,10 +6181,6 @@ static int init_vqs(struct virtnet_info *vi) if (ret) goto err_free; - /* disable for big mode */ - if (!vi->big_packets || vi->mergeable_rx_bufs) - virtnet_rq_set_premapped(vi); - cpus_read_lock(); virtnet_set_affinity(vi); cpus_read_unlock(); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 0842d27886e5..8167be01b400 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -180,9 +180,6 @@ struct vring_virtqueue { /* Host publishes avail event idx */ bool event; - /* Do DMA mapping by driver */ - bool premapped; - /* Head of free buffer list. */ unsigned int free_head; /* Number we've added since last sync. */ @@ -2098,7 +2095,6 @@ static struct virtqueue *vring_create_virtqueue_packed( vq->packed_ring = true; vq->dma_dev = dma_dev; vq->use_dma_api = vring_use_dma_api(vdev); - vq->premapped = false; vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; @@ -2691,7 +2687,6 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index, #endif vq->dma_dev = dma_dev; vq->use_dma_api = vring_use_dma_api(vdev); - vq->premapped = false; vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; @@ -2818,49 +2813,6 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num, } EXPORT_SYMBOL_GPL(virtqueue_resize); -/** - * virtqueue_set_dma_premapped - set the vring premapped mode - * @_vq: the struct virtqueue we're talking about. - * - * Enable the premapped mode of the vq. - * - * The vring in premapped mode does not do dma internally, so the driver must - * do dma mapping in advance. The driver must pass the dma_address through - * dma_address of scatterlist. When the driver got a used buffer from - * the vring, it has to unmap the dma address. - * - * This function must be called immediately after creating the vq, or after vq - * reset, and before adding any buffers to it. - * - * Caller must ensure we don't call this with other virtqueue operations - * at the same time (except where noted). - * - * Returns zero or a negative error. - * 0: success. - * -EINVAL: too late to enable premapped mode, the vq already contains buffers. - */ -int virtqueue_set_dma_premapped(struct virtqueue *_vq) -{ - struct vring_virtqueue *vq = to_vvq(_vq); - u32 num; - - START_USE(vq); - - num = vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num; - - if (num != vq->vq.num_free) { - END_USE(vq); - return -EINVAL; - } - - vq->premapped = true; - - END_USE(vq); - - return 0; -} -EXPORT_SYMBOL_GPL(virtqueue_set_dma_premapped); - /** * virtqueue_reset - detach and recycle all unused buffers * @_vq: the struct virtqueue we're talking about. diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 13b3f55abca3..338e0f5efb4b 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -93,8 +93,6 @@ bool virtqueue_enable_cb(struct virtqueue *vq); unsigned virtqueue_enable_cb_prepare(struct virtqueue *vq); -int virtqueue_set_dma_premapped(struct virtqueue *_vq); - bool virtqueue_poll(struct virtqueue *vq, unsigned); bool virtqueue_enable_cb_delayed(struct virtqueue *vq); -- 2.51.0 From 7db956707f5f57e18c5ebf1681db2923eb6144e1 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 12 Nov 2024 09:29:23 +0800 Subject: [PATCH 13/16] virtio_net: refactor the xmit type Because the af-xdp will introduce a new xmit type, so I refactor the xmit type mechanism first. We know both xdp_frame and sk_buff are at least 4 bytes aligned. For the xdp tx, we do not pass any pointer to virtio core as data, we just need to pass the len of the packet. So we will push len to the void pointer. We can make sure the pointer is 4 bytes aligned. And the data structure of AF_XDP also is at least 4 bytes aligned. So the last two bits of the pointers are free, we can't use these to distinguish them. 00 for skb 01 for SKB_ORPHAN 10 for XDP 11 for AF-XDP tx Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Link: https://patch.msgid.link/20241112012928.102478-9-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 90 +++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 39 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 183ad5e6bef0..539a43777f86 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -45,9 +45,6 @@ module_param(napi_tx, bool, 0644); #define VIRTIO_XDP_TX BIT(0) #define VIRTIO_XDP_REDIR BIT(1) -#define VIRTIO_XDP_FLAG BIT(0) -#define VIRTIO_ORPHAN_FLAG BIT(1) - /* RX packet size EWMA. The average packet size is used to determine the packet * buffer size when refilling RX rings. As the entire RX ring may be refilled * at once, the weight is chosen so that the EWMA will be insensitive to short- @@ -510,6 +507,12 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb, struct page *page, void *buf, int len, int truesize); +enum virtnet_xmit_type { + VIRTNET_XMIT_TYPE_SKB, + VIRTNET_XMIT_TYPE_SKB_ORPHAN, + VIRTNET_XMIT_TYPE_XDP, +}; + static int rss_indirection_table_alloc(struct virtio_net_ctrl_rss *rss, u16 indir_table_size) { if (!indir_table_size) { @@ -529,34 +532,29 @@ static void rss_indirection_table_free(struct virtio_net_ctrl_rss *rss) kfree(rss->indirection_table); } -static bool is_xdp_frame(void *ptr) -{ - return (unsigned long)ptr & VIRTIO_XDP_FLAG; -} +/* We use the last two bits of the pointer to distinguish the xmit type. */ +#define VIRTNET_XMIT_TYPE_MASK (BIT(0) | BIT(1)) -static void *xdp_to_ptr(struct xdp_frame *ptr) +static enum virtnet_xmit_type virtnet_xmit_ptr_unpack(void **ptr) { - return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG); -} + unsigned long p = (unsigned long)*ptr; -static struct xdp_frame *ptr_to_xdp(void *ptr) -{ - return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG); -} + *ptr = (void *)(p & ~VIRTNET_XMIT_TYPE_MASK); -static bool is_orphan_skb(void *ptr) -{ - return (unsigned long)ptr & VIRTIO_ORPHAN_FLAG; + return p & VIRTNET_XMIT_TYPE_MASK; } -static void *skb_to_ptr(struct sk_buff *skb, bool orphan) +static void *virtnet_xmit_ptr_pack(void *ptr, enum virtnet_xmit_type type) { - return (void *)((unsigned long)skb | (orphan ? VIRTIO_ORPHAN_FLAG : 0)); + return (void *)((unsigned long)ptr | type); } -static struct sk_buff *ptr_to_skb(void *ptr) +static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data, + enum virtnet_xmit_type type) { - return (struct sk_buff *)((unsigned long)ptr & ~VIRTIO_ORPHAN_FLAG); + return virtqueue_add_outbuf(sq->vq, sq->sg, num, + virtnet_xmit_ptr_pack(data, type), + GFP_ATOMIC); } static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len) @@ -568,29 +566,37 @@ static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len) static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, bool in_napi, struct virtnet_sq_free_stats *stats) { + struct xdp_frame *frame; + struct sk_buff *skb; unsigned int len; void *ptr; while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) { - if (!is_xdp_frame(ptr)) { - struct sk_buff *skb = ptr_to_skb(ptr); + switch (virtnet_xmit_ptr_unpack(&ptr)) { + case VIRTNET_XMIT_TYPE_SKB: + skb = ptr; pr_debug("Sent skb %p\n", skb); + stats->napi_packets++; + stats->napi_bytes += skb->len; + napi_consume_skb(skb, in_napi); + break; - if (is_orphan_skb(ptr)) { - stats->packets++; - stats->bytes += skb->len; - } else { - stats->napi_packets++; - stats->napi_bytes += skb->len; - } + case VIRTNET_XMIT_TYPE_SKB_ORPHAN: + skb = ptr; + + stats->packets++; + stats->bytes += skb->len; napi_consume_skb(skb, in_napi); - } else { - struct xdp_frame *frame = ptr_to_xdp(ptr); + break; + + case VIRTNET_XMIT_TYPE_XDP: + frame = ptr; stats->packets++; stats->bytes += xdp_get_frame_len(frame); xdp_return_frame(frame); + break; } } netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes); @@ -1450,8 +1456,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, skb_frag_size(frag), skb_frag_off(frag)); } - err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1, - xdp_to_ptr(xdpf), GFP_ATOMIC); + err = virtnet_add_outbuf(sq, nr_frags + 1, xdpf, VIRTNET_XMIT_TYPE_XDP); if (unlikely(err)) return -ENOSPC; /* Caller handle free/refcnt */ @@ -3072,8 +3077,9 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan) return num_sg; num_sg++; } - return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, - skb_to_ptr(skb, orphan), GFP_ATOMIC); + + return virtnet_add_outbuf(sq, num_sg, skb, + orphan ? VIRTNET_XMIT_TYPE_SKB_ORPHAN : VIRTNET_XMIT_TYPE_SKB); } static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) @@ -5991,10 +5997,16 @@ static void free_receive_page_frags(struct virtnet_info *vi) static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf) { - if (!is_xdp_frame(buf)) + switch (virtnet_xmit_ptr_unpack(&buf)) { + case VIRTNET_XMIT_TYPE_SKB: + case VIRTNET_XMIT_TYPE_SKB_ORPHAN: dev_kfree_skb(buf); - else - xdp_return_frame(ptr_to_xdp(buf)); + break; + + case VIRTNET_XMIT_TYPE_XDP: + xdp_return_frame(buf); + break; + } } static void free_unused_bufs(struct virtnet_info *vi) -- 2.51.0 From 21a4e3ce6dc7b0a3bc882ebe1cb921a40235ddb0 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 12 Nov 2024 09:29:24 +0800 Subject: [PATCH 14/16] virtio_net: xsk: bind/unbind xsk for tx This patch implement the logic of bind/unbind xsk pool to sq and rq. Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Link: https://patch.msgid.link/20241112012928.102478-10-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 53 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 539a43777f86..6cd9fdb23b8a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -295,6 +295,10 @@ struct send_queue { /* Record whether sq is in reset state. */ bool reset; + + struct xsk_buff_pool *xsk_pool; + + dma_addr_t xsk_hdr_dma_addr; }; /* Internal representation of a receive virtqueue */ @@ -495,6 +499,8 @@ struct virtio_net_common_hdr { }; }; +static struct virtio_net_common_hdr xsk_hdr; + static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf); static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, struct net_device *dev, @@ -5561,6 +5567,29 @@ unreg: return err; } +static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi, + struct send_queue *sq, + struct xsk_buff_pool *pool) +{ + int err, qindex; + + qindex = sq - vi->sq; + + virtnet_tx_pause(vi, sq); + + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf); + if (err) { + netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err); + pool = NULL; + } + + sq->xsk_pool = pool; + + virtnet_tx_resume(vi, sq); + + return err; +} + static int virtnet_xsk_pool_enable(struct net_device *dev, struct xsk_buff_pool *pool, u16 qid) @@ -5569,6 +5598,7 @@ static int virtnet_xsk_pool_enable(struct net_device *dev, struct receive_queue *rq; struct device *dma_dev; struct send_queue *sq; + dma_addr_t hdr_dma; int err, size; if (vi->hdr_len > xsk_pool_get_headroom(pool)) @@ -5606,6 +5636,11 @@ static int virtnet_xsk_pool_enable(struct net_device *dev, if (!rq->xsk_buffs) return -ENOMEM; + hdr_dma = virtqueue_dma_map_single_attrs(sq->vq, &xsk_hdr, vi->hdr_len, + DMA_TO_DEVICE, 0); + if (virtqueue_dma_mapping_error(sq->vq, hdr_dma)) + return -ENOMEM; + err = xsk_pool_dma_map(pool, dma_dev, 0); if (err) goto err_xsk_map; @@ -5614,11 +5649,24 @@ static int virtnet_xsk_pool_enable(struct net_device *dev, if (err) goto err_rq; + err = virtnet_sq_bind_xsk_pool(vi, sq, pool); + if (err) + goto err_sq; + + /* Now, we do not support tx offload(such as tx csum), so all the tx + * virtnet hdr is zero. So all the tx packets can share a single hdr. + */ + sq->xsk_hdr_dma_addr = hdr_dma; + return 0; +err_sq: + virtnet_rq_bind_xsk_pool(vi, rq, NULL); err_rq: xsk_pool_dma_unmap(pool, 0); err_xsk_map: + virtqueue_dma_unmap_single_attrs(rq->vq, hdr_dma, vi->hdr_len, + DMA_TO_DEVICE, 0); return err; } @@ -5627,19 +5675,24 @@ static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid) struct virtnet_info *vi = netdev_priv(dev); struct xsk_buff_pool *pool; struct receive_queue *rq; + struct send_queue *sq; int err; if (qid >= vi->curr_queue_pairs) return -EINVAL; + sq = &vi->sq[qid]; rq = &vi->rq[qid]; pool = rq->xsk_pool; err = virtnet_rq_bind_xsk_pool(vi, rq, NULL); + err |= virtnet_sq_bind_xsk_pool(vi, sq, NULL); xsk_pool_dma_unmap(pool, 0); + virtqueue_dma_unmap_single_attrs(sq->vq, sq->xsk_hdr_dma_addr, + vi->hdr_len, DMA_TO_DEVICE, 0); kvfree(rq->xsk_buffs); return err; -- 2.51.0 From 1df5116a41a823f6c6755b1d04062f56ba4ea1e5 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 12 Nov 2024 09:29:25 +0800 Subject: [PATCH 15/16] virtio_net: xsk: prevent disable tx napi Since xsk's TX queue is consumed by TX NAPI, if sq is bound to xsk, then we must stop tx napi from being disabled. Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Link: https://patch.msgid.link/20241112012928.102478-11-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 6cd9fdb23b8a..d5b8567f77d0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -5090,7 +5090,7 @@ static int virtnet_set_coalesce(struct net_device *dev, struct netlink_ext_ack *extack) { struct virtnet_info *vi = netdev_priv(dev); - int ret, queue_number, napi_weight; + int ret, queue_number, napi_weight, i; bool update_napi = false; /* Can't change NAPI weight if the link is up */ @@ -5119,6 +5119,14 @@ static int virtnet_set_coalesce(struct net_device *dev, return ret; if (update_napi) { + /* xsk xmit depends on the tx napi. So if xsk is active, + * prevent modifications to tx napi. + */ + for (i = queue_number; i < vi->max_queue_pairs; i++) { + if (vi->sq[i].xsk_pool) + return -EBUSY; + } + for (; queue_number < vi->max_queue_pairs; queue_number++) vi->sq[queue_number].napi.weight = napi_weight; } -- 2.51.0 From 89f86675cb0348c9c7acf77263c2359e772a6768 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 12 Nov 2024 09:29:26 +0800 Subject: [PATCH 16/16] virtio_net: xsk: tx: support xmit xsk buffer The driver's tx napi is very important for XSK. It is responsible for obtaining data from the XSK queue and sending it out. At the beginning, we need to trigger tx napi. virtnet_free_old_xmit distinguishes three type ptr(skb, xdp frame, xsk buffer) by the last bits of the pointer. Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Link: https://patch.msgid.link/20241112012928.102478-12-xuanzhuo@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 179 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 168 insertions(+), 11 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d5b8567f77d0..57642bd83b7b 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -83,6 +83,7 @@ struct virtnet_sq_free_stats { u64 bytes; u64 napi_packets; u64 napi_bytes; + u64 xsk; }; struct virtnet_sq_stats { @@ -512,11 +513,13 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb, struct sk_buff *curr_skb, struct page *page, void *buf, int len, int truesize); +static void virtnet_xsk_completed(struct send_queue *sq, int num); enum virtnet_xmit_type { VIRTNET_XMIT_TYPE_SKB, VIRTNET_XMIT_TYPE_SKB_ORPHAN, VIRTNET_XMIT_TYPE_XDP, + VIRTNET_XMIT_TYPE_XSK, }; static int rss_indirection_table_alloc(struct virtio_net_ctrl_rss *rss, u16 indir_table_size) @@ -541,6 +544,8 @@ static void rss_indirection_table_free(struct virtio_net_ctrl_rss *rss) /* We use the last two bits of the pointer to distinguish the xmit type. */ #define VIRTNET_XMIT_TYPE_MASK (BIT(0) | BIT(1)) +#define VIRTIO_XSK_FLAG_OFFSET 2 + static enum virtnet_xmit_type virtnet_xmit_ptr_unpack(void **ptr) { unsigned long p = (unsigned long)*ptr; @@ -563,6 +568,11 @@ static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data, GFP_ATOMIC); } +static u32 virtnet_ptr_to_xsk_buff_len(void *ptr) +{ + return ((unsigned long)ptr) >> VIRTIO_XSK_FLAG_OFFSET; +} + static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len) { sg_dma_address(sg) = addr; @@ -603,11 +613,27 @@ static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, stats->bytes += xdp_get_frame_len(frame); xdp_return_frame(frame); break; + + case VIRTNET_XMIT_TYPE_XSK: + stats->bytes += virtnet_ptr_to_xsk_buff_len(ptr); + stats->xsk++; + break; } } netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes); } +static void virtnet_free_old_xmit(struct send_queue *sq, + struct netdev_queue *txq, + bool in_napi, + struct virtnet_sq_free_stats *stats) +{ + __free_old_xmit(sq, txq, in_napi, stats); + + if (stats->xsk) + virtnet_xsk_completed(sq, stats->xsk); +} + /* Converting between virtqueue no. and kernel tx/rx queue no. * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq */ @@ -1037,7 +1063,7 @@ static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq, { struct virtnet_sq_free_stats stats = {0}; - __free_old_xmit(sq, txq, in_napi, &stats); + virtnet_free_old_xmit(sq, txq, in_napi, &stats); /* Avoid overhead when no packets have been processed * happens when called speculatively from start_xmit. @@ -1399,6 +1425,113 @@ err: return err; } +static void *virtnet_xsk_to_ptr(u32 len) +{ + unsigned long p; + + p = len << VIRTIO_XSK_FLAG_OFFSET; + + return virtnet_xmit_ptr_pack((void *)p, VIRTNET_XMIT_TYPE_XSK); +} + +static int virtnet_xsk_xmit_one(struct send_queue *sq, + struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + struct virtnet_info *vi; + dma_addr_t addr; + + vi = sq->vq->vdev->priv; + + addr = xsk_buff_raw_get_dma(pool, desc->addr); + xsk_buff_raw_dma_sync_for_device(pool, addr, desc->len); + + sg_init_table(sq->sg, 2); + sg_fill_dma(sq->sg, sq->xsk_hdr_dma_addr, vi->hdr_len); + sg_fill_dma(sq->sg + 1, addr, desc->len); + + return virtqueue_add_outbuf_premapped(sq->vq, sq->sg, 2, + virtnet_xsk_to_ptr(desc->len), + GFP_ATOMIC); +} + +static int virtnet_xsk_xmit_batch(struct send_queue *sq, + struct xsk_buff_pool *pool, + unsigned int budget, + u64 *kicks) +{ + struct xdp_desc *descs = pool->tx_descs; + bool kick = false; + u32 nb_pkts, i; + int err; + + budget = min_t(u32, budget, sq->vq->num_free); + + nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget); + if (!nb_pkts) + return 0; + + for (i = 0; i < nb_pkts; i++) { + err = virtnet_xsk_xmit_one(sq, pool, &descs[i]); + if (unlikely(err)) { + xsk_tx_completed(sq->xsk_pool, nb_pkts - i); + break; + } + + kick = true; + } + + if (kick && virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) + (*kicks)++; + + return i; +} + +static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool, + int budget) +{ + struct virtnet_info *vi = sq->vq->vdev->priv; + struct virtnet_sq_free_stats stats = {}; + struct net_device *dev = vi->dev; + u64 kicks = 0; + int sent; + + /* Avoid to wakeup napi meanless, so call __free_old_xmit instead of + * free_old_xmit(). + */ + __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats); + + if (stats.xsk) + xsk_tx_completed(sq->xsk_pool, stats.xsk); + + sent = virtnet_xsk_xmit_batch(sq, pool, budget, &kicks); + + if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq)) + check_sq_full_and_disable(vi, vi->dev, sq); + + u64_stats_update_begin(&sq->stats.syncp); + u64_stats_add(&sq->stats.packets, stats.packets); + u64_stats_add(&sq->stats.bytes, stats.bytes); + u64_stats_add(&sq->stats.kicks, kicks); + u64_stats_add(&sq->stats.xdp_tx, sent); + u64_stats_update_end(&sq->stats.syncp); + + if (xsk_uses_need_wakeup(pool)) + xsk_set_tx_need_wakeup(pool); + + return sent; +} + +static void xsk_wakeup(struct send_queue *sq) +{ + if (napi_if_scheduled_mark_missed(&sq->napi)) + return; + + local_bh_disable(); + virtqueue_napi_schedule(&sq->napi, sq->vq); + local_bh_enable(); +} + static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag) { struct virtnet_info *vi = netdev_priv(dev); @@ -1412,14 +1545,19 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag) sq = &vi->sq[qid]; - if (napi_if_scheduled_mark_missed(&sq->napi)) - return 0; + xsk_wakeup(sq); + return 0; +} - local_bh_disable(); - virtqueue_napi_schedule(&sq->napi, sq->vq); - local_bh_enable(); +static void virtnet_xsk_completed(struct send_queue *sq, int num) +{ + xsk_tx_completed(sq->xsk_pool, num); - return 0; + /* If this is called by rx poll, start_xmit and xdp xmit we should + * wakeup the tx napi to consume the xsk tx queue, because the tx + * interrupt may not be triggered. + */ + xsk_wakeup(sq); } static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, @@ -1535,8 +1673,8 @@ static int virtnet_xdp_xmit(struct net_device *dev, } /* Free up any pending old buffers before queueing new ones. */ - __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), - false, &stats); + virtnet_free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), + false, &stats); for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; @@ -2993,7 +3131,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) struct virtnet_info *vi = sq->vq->vdev->priv; unsigned int index = vq2txq(sq->vq); struct netdev_queue *txq; - int opaque; + int opaque, xsk_done = 0; bool done; if (unlikely(is_xdp_raw_buffer_queue(vi, index))) { @@ -3005,7 +3143,11 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) txq = netdev_get_tx_queue(vi->dev, index); __netif_tx_lock(txq, raw_smp_processor_id()); virtqueue_disable_cb(sq->vq); - free_old_xmit(sq, txq, !!budget); + + if (sq->xsk_pool) + xsk_done = virtnet_xsk_xmit(sq, sq->xsk_pool, budget); + else + free_old_xmit(sq, txq, !!budget); if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) { if (netif_tx_queue_stopped(txq)) { @@ -3016,6 +3158,11 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget) netif_tx_wake_queue(txq); } + if (xsk_done >= budget) { + __netif_tx_unlock(txq); + return budget; + } + opaque = virtqueue_enable_cb_prepare(sq->vq); done = napi_complete_done(napi, 0); @@ -6058,6 +6205,12 @@ static void free_receive_page_frags(struct virtnet_info *vi) static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf) { + struct virtnet_info *vi = vq->vdev->priv; + struct send_queue *sq; + int i = vq2rxq(vq); + + sq = &vi->sq[i]; + switch (virtnet_xmit_ptr_unpack(&buf)) { case VIRTNET_XMIT_TYPE_SKB: case VIRTNET_XMIT_TYPE_SKB_ORPHAN: @@ -6067,6 +6220,10 @@ static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf) case VIRTNET_XMIT_TYPE_XDP: xdp_return_frame(buf); break; + + case VIRTNET_XMIT_TYPE_XSK: + xsk_tx_completed(sq->xsk_pool, 1); + break; } } -- 2.51.0