[PATCH 10/13] bpf: Add socket assign support

Kleber Souza kleber.souza at canonical.com
Thu Sep 3 10:57:13 UTC 2020


On 31.08.20 06:03, Khalid Elmously wrote:
> From: Joe Stringer <joe at wand.net.nz>

This patch is missing:

BugLink: https://bugs.launchpad.net/bugs/1887740

> 
> [ upstream commit cf7fbe660f2dbd738ab58aea8e9b0ca6ad232449 ]
> 
> Add support for TPROXY via a new bpf helper, bpf_sk_assign().
> 
> This helper requires the BPF program to discover the socket via a call
> to bpf_sk*_lookup_*(), then pass this socket to the new helper. The
> helper takes its own reference to the socket in addition to any existing
> reference that may or may not currently be held for the duration of
> BPF processing. For the destination socket to receive the traffic, the
> traffic must be routed towards that socket via local route. The
> simplest example route is below, but in practice you may want to route
> traffic more narrowly (e.g. by CIDR):
> 
>   $ ip route add local default dev lo
> 
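
For anyone wanting to exercise this on the backport kernel: the BPF side
boils down to a socket lookup followed by the new helper. Below is a
minimal sketch, loosely modelled on the upstream selftests that accompany
this series -- the section name, the attachment commands, port 4321 and
the hard-coded tuple are illustrative assumptions, not part of this patch:

// SPDX-License-Identifier: GPL-2.0
/* Illustrative TC ingress program: steer matching TCP traffic to a
 * local listener via bpf_sk_assign(). Attach (also illustrative):
 *
 *   tc qdisc add dev eth0 clsact
 *   tc filter add dev eth0 ingress bpf da obj sk_assign_kern.o sec classifier
 */
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("classifier")
int steer_to_local(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tuple = {};
	struct bpf_sock *sk;
	long err;

	/* A real program would parse the packet and decide which flows
	 * to steer; here the target listener is simply hard-coded.
	 * Only daddr/dport are needed to find a listening socket.
	 */
	tuple.ipv4.daddr = bpf_htonl(0x7f000001);	/* 127.0.0.1 */
	tuple.ipv4.dport = bpf_htons(4321);

	sk = bpf_skc_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
				BPF_F_CURRENT_NETNS, 0);
	if (!sk)
		return TC_ACT_SHOT;

	err = bpf_sk_assign(skb, sk, 0);

	/* The helper takes its own reference, so the lookup reference
	 * is dropped regardless of the assign result.
	 */
	bpf_sk_release(sk);

	return err ? TC_ACT_SHOT : TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";

Note the unconditional bpf_sk_release(): per the paragraph above, the
helper holds its own reference in addition to the one from the lookup.
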
> This patch avoids trying to introduce an extra bit into skb->sk, as
> that would require more invasive changes to all code interacting with
> the socket to ensure that the bit is handled correctly, such as all
> error-handling cases along the path from the helper in BPF through to
> the orphan path in the input. Instead, we opt to use the destructor
> variable to switch on the prefetch of the socket.
> 
> Signed-off-by: Joe Stringer <joe at wand.net.nz>
> Signed-off-by: Alexei Starovoitov <ast at kernel.org>
> Acked-by: Martin KaFai Lau <kafai at fb.com>
> Link: https://lore.kernel.org/bpf/20200329225342.16317-2-joe@wand.net.nz
> [ Backport note: Conflict resolution by pulling in remaining uapi enums for
>   helpers; this is not an issue since they are not enabled for any prog type
>   except the sk assign one added here. ]
> Signed-off-by: Daniel Borkmann <daniel at iogearbox.net>
> Signed-off-by: Khalid Elmously <khalid.elmously at canonical.com>
> ---
>  include/net/sock.h             | 11 +++++++++
>  include/uapi/linux/bpf.h       | 44 +++++++++++++++++++++++++++++++++-
>  net/core/filter.c              | 31 ++++++++++++++++++++++++
>  net/core/sock.c                | 11 +++++++++
>  net/ipv4/ip_input.c            |  3 ++-
>  net/ipv6/ip6_input.c           |  3 ++-
>  net/sched/act_bpf.c            |  3 +++
>  tools/include/uapi/linux/bpf.h | 44 +++++++++++++++++++++++++++++++++-
>  8 files changed, 146 insertions(+), 4 deletions(-)
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 6c5a3809483e..b754050401d8 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1617,6 +1617,7 @@ void sock_rfree(struct sk_buff *skb);
>  void sock_efree(struct sk_buff *skb);
>  #ifdef CONFIG_INET
>  void sock_edemux(struct sk_buff *skb);
> +void sock_pfree(struct sk_buff *skb);
>  #else
>  #define sock_edemux sock_efree
>  #endif
> @@ -2481,6 +2482,16 @@ void sock_net_set(struct sock *sk, struct net *net)
>  	write_pnet(&sk->sk_net, net);
>  }
>  
> +static inline bool
> +skb_sk_is_prefetched(struct sk_buff *skb)
> +{
> +#ifdef CONFIG_INET
> +	return skb->destructor == sock_pfree;
> +#else
> +	return false;
> +#endif /* CONFIG_INET */
> +}
> +
>  static inline struct sock *skb_steal_sock(struct sk_buff *skb)
>  {
>  	if (skb->sk) {
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 50d657641485..a4c16784d645 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2945,6 +2945,46 @@ union bpf_attr {
>   * 		instead of sockets.
>   * 	Return
>   * 		A 8-byte long opaque number.
> + *
> + * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level)
> + * 	Description
> + * 		Return id of cgroup v2 that is ancestor of the cgroup associated
> + * 		with the current task at the *ancestor_level*. The root cgroup
> + * 		is at *ancestor_level* zero and each step down the hierarchy
> + * 		increments the level. If *ancestor_level* == level of cgroup
> + * 		associated with the current task, then return value will be the
> + * 		same as that of **bpf_get_current_cgroup_id**\ ().
> + *
> + * 		The helper is useful to implement policies based on cgroups
> + * 		that are upper in hierarchy than immediate cgroup associated
> + * 		with the current task.
> + *
> + * 		The format of returned id and helper limitations are same as in
> + * 		**bpf_get_current_cgroup_id**\ ().
> + * 	Return
> + * 		The id is returned or 0 in case the id could not be retrieved.
> + *
> + * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
> + *	Description
> + *		Assign the *sk* to the *skb*. When combined with appropriate
> + *		routing configuration to receive the packet towards the socket,
> + *		will cause *skb* to be delivered to the specified socket.
> + *		Subsequent redirection of *skb* via  **bpf_redirect**\ (),
> + *		**bpf_clone_redirect**\ () or other methods outside of BPF may
> + *		interfere with successful delivery to the socket.
> + *
> + *		This operation is only valid from TC ingress path.
> + *
> + *		The *flags* argument must be zero.
> + *	Return
> + *		0 on success, or a negative errno in case of failure.
> + *
> + *		* **-EINVAL**		Unsupported flags specified.
> + *		* **-ENOENT**		Socket is unavailable for assignment.
> + *		* **-ENETUNREACH**	Socket is unreachable (wrong netns).
> + *		* **-EOPNOTSUPP**	Unsupported operation, for example a
> + *					call from outside of TC ingress.
> + *		* **-ESOCKTNOSUPPORT**	Socket type not supported (reuseport).
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -3069,7 +3109,9 @@ union bpf_attr {
>  	FN(read_branch_records),	\
>  	FN(get_ns_current_pid_tgid),	\
>  	FN(xdp_output),			\
> -	FN(get_netns_cookie),
> +	FN(get_netns_cookie),		\
> +	FN(get_current_ancestor_cgroup_id),	\
> +	FN(sk_assign),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
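
A side note on the other uapi addition above: as the backport note says,
the bpf_get_current_ancestor_cgroup_id() enum is only pulled in for
conflict resolution and is not wired up for any program type in this
kernel. For reference only, upstream the helper is available to tracing
programs and is used roughly like below (the tracepoint and the ancestor
level are arbitrary examples):

// SPDX-License-Identifier: GPL-2.0
/* Hedged sketch only -- this helper is not enabled in this backport. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("tracepoint/syscalls/sys_enter_openat")
int log_ancestor(void *ctx)
{
	/* Id of the current task's cgroup-v2 ancestor two levels below
	 * the root (level 0 == root); same id format and limitations as
	 * bpf_get_current_cgroup_id().
	 */
	__u64 cg = bpf_get_current_ancestor_cgroup_id(2);

	bpf_printk("ancestor cgroup id: %llu\n", cg);
	return 0;
}

char _license[] SEC("license") = "GPL";
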
> diff --git a/net/core/filter.c b/net/core/filter.c
> index be220ee91d1f..4033799dafd5 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5985,6 +5985,35 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
>  	.arg5_type	= ARG_CONST_SIZE,
>  };
>  
> +BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
> +{
> +	if (flags != 0)
> +		return -EINVAL;
> +	if (!skb_at_tc_ingress(skb))
> +		return -EOPNOTSUPP;
> +	if (unlikely(dev_net(skb->dev) != sock_net(sk)))
> +		return -ENETUNREACH;
> +	if (unlikely(sk->sk_reuseport))
> +		return -ESOCKTNOSUPPORT;
> +	if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
> +		return -ENOENT;
> +
> +	skb_orphan(skb);
> +	skb->sk = sk;
> +	skb->destructor = sock_pfree;
> +
> +	return 0;
> +}
> +
> +static const struct bpf_func_proto bpf_sk_assign_proto = {
> +	.func		= bpf_sk_assign,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type      = ARG_PTR_TO_CTX,
> +	.arg2_type      = ARG_PTR_TO_SOCK_COMMON,
> +	.arg3_type	= ARG_ANYTHING,
> +};
> +
>  #endif /* CONFIG_INET */
>  
>  bool bpf_helper_changes_pkt_data(void *func)
> @@ -6304,6 +6333,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_skb_ecn_set_ce_proto;
>  	case BPF_FUNC_tcp_gen_syncookie:
>  		return &bpf_tcp_gen_syncookie_proto;
> +	case BPF_FUNC_sk_assign:
> +		return &bpf_sk_assign_proto;
>  #endif
>  	default:
>  		return bpf_base_func_proto(func_id);
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 0adf7a9e5a90..036ba43d044a 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -2065,6 +2065,17 @@ void sock_efree(struct sk_buff *skb)
>  }
>  EXPORT_SYMBOL(sock_efree);
>  
> +/* Buffer destructor for prefetch/receive path where reference count may
> + * not be held, e.g. for listen sockets.
> + */
> +#ifdef CONFIG_INET
> +void sock_pfree(struct sk_buff *skb)
> +{
> +	sock_gen_put(skb->sk);
> +}
> +EXPORT_SYMBOL(sock_pfree);
> +#endif /* CONFIG_INET */
> +
>  kuid_t sock_i_uid(struct sock *sk)
>  {
>  	kuid_t uid;
> diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
> index c59a78a267c3..72cbfc79661c 100644
> --- a/net/ipv4/ip_input.c
> +++ b/net/ipv4/ip_input.c
> @@ -494,7 +494,8 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
>  	IPCB(skb)->iif = skb->skb_iif;
>  
>  	/* Must drop socket now because of tproxy. */
> -	skb_orphan(skb);
> +	if (!skb_sk_is_prefetched(skb))
> +		skb_orphan(skb);
>  
>  	return skb;
>  
> diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
> index 3d71c7d6102c..f2336b5c60f4 100644
> --- a/net/ipv6/ip6_input.c
> +++ b/net/ipv6/ip6_input.c
> @@ -263,7 +263,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
>  	rcu_read_unlock();
>  
>  	/* Must drop socket now because of tproxy. */
> -	skb_orphan(skb);
> +	if (!skb_sk_is_prefetched(skb))
> +		skb_orphan(skb);
>  
>  	return skb;
>  err:
> diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
> index 04b7bd4ec751..2327e393fe16 100644
> --- a/net/sched/act_bpf.c
> +++ b/net/sched/act_bpf.c
> @@ -12,6 +12,7 @@
>  #include <linux/bpf.h>
>  
>  #include <net/netlink.h>
> +#include <net/sock.h>
>  #include <net/pkt_sched.h>
>  #include <net/pkt_cls.h>
>  
> @@ -53,6 +54,8 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
>  		bpf_compute_data_pointers(skb);
>  		filter_res = BPF_PROG_RUN(filter, skb);
>  	}
> +	if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK)
> +		skb_orphan(skb);
>  	rcu_read_unlock();
>  
>  	/* A BPF program may overwrite the default action opcode.
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 50d657641485..a4c16784d645 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -2945,6 +2945,46 @@ union bpf_attr {
>   * 		instead of sockets.
>   * 	Return
>   * 		A 8-byte long opaque number.
> + *
> + * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level)
> + * 	Description
> + * 		Return id of cgroup v2 that is ancestor of the cgroup associated
> + * 		with the current task at the *ancestor_level*. The root cgroup
> + * 		is at *ancestor_level* zero and each step down the hierarchy
> + * 		increments the level. If *ancestor_level* == level of cgroup
> + * 		associated with the current task, then return value will be the
> + * 		same as that of **bpf_get_current_cgroup_id**\ ().
> + *
> + * 		The helper is useful to implement policies based on cgroups
> + * 		that are upper in hierarchy than immediate cgroup associated
> + * 		with the current task.
> + *
> + * 		The format of returned id and helper limitations are same as in
> + * 		**bpf_get_current_cgroup_id**\ ().
> + * 	Return
> + * 		The id is returned or 0 in case the id could not be retrieved.
> + *
> + * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
> + *	Description
> + *		Assign the *sk* to the *skb*. When combined with appropriate
> + *		routing configuration to receive the packet towards the socket,
> + *		will cause *skb* to be delivered to the specified socket.
> + *		Subsequent redirection of *skb* via  **bpf_redirect**\ (),
> + *		**bpf_clone_redirect**\ () or other methods outside of BPF may
> + *		interfere with successful delivery to the socket.
> + *
> + *		This operation is only valid from TC ingress path.
> + *
> + *		The *flags* argument must be zero.
> + *	Return
> + *		0 on success, or a negative errno in case of failure.
> + *
> + *		* **-EINVAL**		Unsupported flags specified.
> + *		* **-ENOENT**		Socket is unavailable for assignment.
> + *		* **-ENETUNREACH**	Socket is unreachable (wrong netns).
> + *		* **-EOPNOTSUPP**	Unsupported operation, for example a
> + *					call from outside of TC ingress.
> + *		* **-ESOCKTNOSUPPORT**	Socket type not supported (reuseport).
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -3069,7 +3109,9 @@ union bpf_attr {
>  	FN(read_branch_records),	\
>  	FN(get_ns_current_pid_tgid),	\
>  	FN(xdp_output),			\
> -	FN(get_netns_cookie),
> +	FN(get_netns_cookie),		\
> +	FN(get_current_ancestor_cgroup_id),	\
> +	FN(sk_assign),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> 
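
For completeness, the userspace side of a localhost setup like the sketch
further up should need nothing TPROXY-specific: an ordinary listener on
the target address/port, the local route from the commit message, and
notably not a reuseport socket (the helper rejects those with
-ESOCKTNOSUPPORT). Roughly:

/* Plain TCP listener for the flows steered by the BPF program above.
 * 127.0.0.1:4321 matches the assumptions in the earlier sketch.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(4321),
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 128) < 0) {
		perror("socket/bind/listen");
		return 1;
	}
	for (;;) {
		int conn = accept(fd, NULL, NULL);
		if (conn >= 0)
			close(conn);	/* handle the proxied flow here */
	}
}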