[3.8.y.z extended stable] Patch "netfilter: push reasm skb through instead of original frag skbs" has been added to staging queue

Kamal Mostafa kamal at canonical.com
Wed Dec 11 22:03:16 UTC 2013


On Wed, 2013-12-11 at 12:09 -0800, Kamal Mostafa wrote:
> This is a note to let you know that I have just added a patch titled
> 
>     netfilter: push reasm skb through instead of original frag skbs
> 
> to the linux-3.8.y-queue branch of the 3.8.y.z extended stable tree 
> which can be found at:
> 
>  http://kernel.ubuntu.com/git?p=ubuntu/linux.git;a=shortlog;h=refs/heads/linux-3.8.y-queue
> 
> This patch is scheduled to be released in version 3.8.13.15.


Oops... On closer inspection, this patch isn't suitable for 3.8-stable,
so I'm dropping it from the 3.8 queue.  Sorry for the noise!

 -Kamal


> If you, or anyone else, feels it should not be added to this tree, please 
> reply to this email.
> 
> For more information about the 3.8.y.z tree, see
> https://wiki.ubuntu.com/Kernel/Dev/ExtendedStable
> 
> Thanks.
> -Kamal
> 
> ------
> 
> From 637a18d24a58603b795ffa3ceac67d49827c8558 Mon Sep 17 00:00:00 2001
> From: Jiri Pirko <jiri at resnulli.us>
> Date: Wed, 6 Nov 2013 17:52:20 +0100
> Subject: netfilter: push reasm skb through instead of original frag skbs
> 
> [ Upstream commit 6aafeef03b9d9ecf255f3a80ed85ee070260e1ae ]
> 
> Pushing original fragments through causes several problems. For example
> for matching, frags may not be matched correctly. Take following
> example:
> 
> <example>
> On HOSTA do:
> ip6tables -I INPUT -p icmpv6 -j DROP
> ip6tables -I INPUT -p icmpv6 -m icmp6 --icmpv6-type 128 -j ACCEPT
> 
> and on HOSTB you do:
> ping6 HOSTA -s2000    (MTU is 1500)
> 
> Incoming echo requests will be filtered out on HOSTA. This issue does
> not occur with smaller packets than MTU (where fragmentation does not happen)
> </example>
> 
> As was discussed previously, the only correct solution seems to be to use
> reassembled skb instead of separete frags. Doing this has positive side
> effects in reducing sk_buff by one pointer (nfct_reasm) and also the reams
> dances in ipvs and conntrack can be removed.
> 
> Future plan is to remove net/ipv6/netfilter/nf_conntrack_reasm.c
> entirely and use code in net/ipv6/reassembly.c instead.
> 
> Signed-off-by: Jiri Pirko <jiri at resnulli.us>
> Acked-by: Julian Anastasov <ja at ssi.bg>
> Signed-off-by: Marcelo Ricardo Leitner <mleitner at redhat.com>
> Signed-off-by: David S. Miller <davem at davemloft.net>
> Signed-off-by: Kamal Mostafa <kamal at canonical.com>
> ---
>  include/linux/skbuff.h                         | 32 ---------------
>  include/net/ip_vs.h                            | 32 +--------------
>  include/net/netfilter/ipv6/nf_defrag_ipv6.h    |  5 +--
>  net/core/skbuff.c                              |  3 --
>  net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |  2 +-
>  net/ipv6/netfilter/nf_conntrack_reasm.c        | 19 +--------
>  net/ipv6/netfilter/nf_defrag_ipv6_hooks.c      |  7 +++-
>  net/netfilter/ipvs/ip_vs_core.c                | 55 +-------------------------
>  net/netfilter/ipvs/ip_vs_pe_sip.c              |  8 +---
>  9 files changed, 12 insertions(+), 151 deletions(-)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index db29f78..ffe740d 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -319,11 +319,6 @@ typedef unsigned int sk_buff_data_t;
>  typedef unsigned char *sk_buff_data_t;
>  #endif
> 
> -#if defined(CONFIG_NF_DEFRAG_IPV4) || defined(CONFIG_NF_DEFRAG_IPV4_MODULE) || \
> -    defined(CONFIG_NF_DEFRAG_IPV6) || defined(CONFIG_NF_DEFRAG_IPV6_MODULE)
> -#define NET_SKBUFF_NF_DEFRAG_NEEDED 1
> -#endif
> -
>  /**
>   *	struct sk_buff - socket buffer
>   *	@next: Next buffer in list
> @@ -356,7 +351,6 @@ typedef unsigned char *sk_buff_data_t;
>   *	@protocol: Packet protocol from driver
>   *	@destructor: Destruct function
>   *	@nfct: Associated connection, if any
> - *	@nfct_reasm: netfilter conntrack re-assembly pointer
>   *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
>   *	@skb_iif: ifindex of device we arrived on
>   *	@tc_index: Traffic control index
> @@ -441,9 +435,6 @@ struct sk_buff {
>  #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
>  	struct nf_conntrack	*nfct;
>  #endif
> -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
> -	struct sk_buff		*nfct_reasm;
> -#endif
>  #ifdef CONFIG_BRIDGE_NETFILTER
>  	struct nf_bridge_info	*nf_bridge;
>  #endif
> @@ -2572,18 +2563,6 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct)
>  		atomic_inc(&nfct->use);
>  }
>  #endif
> -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
> -static inline void nf_conntrack_get_reasm(struct sk_buff *skb)
> -{
> -	if (skb)
> -		atomic_inc(&skb->users);
> -}
> -static inline void nf_conntrack_put_reasm(struct sk_buff *skb)
> -{
> -	if (skb)
> -		kfree_skb(skb);
> -}
> -#endif
>  #ifdef CONFIG_BRIDGE_NETFILTER
>  static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge)
>  {
> @@ -2602,10 +2581,6 @@ static inline void nf_reset(struct sk_buff *skb)
>  	nf_conntrack_put(skb->nfct);
>  	skb->nfct = NULL;
>  #endif
> -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
> -	nf_conntrack_put_reasm(skb->nfct_reasm);
> -	skb->nfct_reasm = NULL;
> -#endif
>  #ifdef CONFIG_BRIDGE_NETFILTER
>  	nf_bridge_put(skb->nf_bridge);
>  	skb->nf_bridge = NULL;
> @@ -2627,10 +2602,6 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src)
>  	nf_conntrack_get(src->nfct);
>  	dst->nfctinfo = src->nfctinfo;
>  #endif
> -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
> -	dst->nfct_reasm = src->nfct_reasm;
> -	nf_conntrack_get_reasm(src->nfct_reasm);
> -#endif
>  #ifdef CONFIG_BRIDGE_NETFILTER
>  	dst->nf_bridge  = src->nf_bridge;
>  	nf_bridge_get(src->nf_bridge);
> @@ -2642,9 +2613,6 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src)
>  #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
>  	nf_conntrack_put(dst->nfct);
>  #endif
> -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
> -	nf_conntrack_put_reasm(dst->nfct_reasm);
> -#endif
>  #ifdef CONFIG_BRIDGE_NETFILTER
>  	nf_bridge_put(dst->nf_bridge);
>  #endif
> diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
> index fce8e6b..c358446 100644
> --- a/include/net/ip_vs.h
> +++ b/include/net/ip_vs.h
> @@ -109,7 +109,6 @@ extern int ip_vs_conn_tab_size;
>  struct ip_vs_iphdr {
>  	__u32 len;	/* IPv4 simply where L4 starts
>  			   IPv6 where L4 Transport Header starts */
> -	__u32 thoff_reasm; /* Transport Header Offset in nfct_reasm skb */
>  	__u16 fragoffs; /* IPv6 fragment offset, 0 if first frag (or not frag)*/
>  	__s16 protocol;
>  	__s32 flags;
> @@ -117,34 +116,12 @@ struct ip_vs_iphdr {
>  	union nf_inet_addr daddr;
>  };
> 
> -/* Dependency to module: nf_defrag_ipv6 */
> -#if defined(CONFIG_NF_DEFRAG_IPV6) || defined(CONFIG_NF_DEFRAG_IPV6_MODULE)
> -static inline struct sk_buff *skb_nfct_reasm(const struct sk_buff *skb)
> -{
> -	return skb->nfct_reasm;
> -}
> -static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset,
> -				      int len, void *buffer,
> -				      const struct ip_vs_iphdr *ipvsh)
> -{
> -	if (unlikely(ipvsh->fragoffs && skb_nfct_reasm(skb)))
> -		return skb_header_pointer(skb_nfct_reasm(skb),
> -					  ipvsh->thoff_reasm, len, buffer);
> -
> -	return skb_header_pointer(skb, offset, len, buffer);
> -}
> -#else
> -static inline struct sk_buff *skb_nfct_reasm(const struct sk_buff *skb)
> -{
> -	return NULL;
> -}
>  static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset,
>  				      int len, void *buffer,
>  				      const struct ip_vs_iphdr *ipvsh)
>  {
>  	return skb_header_pointer(skb, offset, len, buffer);
>  }
> -#endif
> 
>  static inline void
>  ip_vs_fill_ip4hdr(const void *nh, struct ip_vs_iphdr *iphdr)
> @@ -171,19 +148,12 @@ ip_vs_fill_iph_skb(int af, const struct sk_buff *skb, struct ip_vs_iphdr *iphdr)
>  			(struct ipv6hdr *)skb_network_header(skb);
>  		iphdr->saddr.in6 = iph->saddr;
>  		iphdr->daddr.in6 = iph->daddr;
> -		/* ipv6_find_hdr() updates len, flags, thoff_reasm */
> -		iphdr->thoff_reasm = 0;
> +		/* ipv6_find_hdr() updates len, flags */
>  		iphdr->len	 = 0;
>  		iphdr->flags	 = 0;
>  		iphdr->protocol  = ipv6_find_hdr(skb, &iphdr->len, -1,
>  						 &iphdr->fragoffs,
>  						 &iphdr->flags);
> -		/* get proto from re-assembled packet and it's offset */
> -		if (skb_nfct_reasm(skb))
> -			iphdr->protocol = ipv6_find_hdr(skb_nfct_reasm(skb),
> -							&iphdr->thoff_reasm,
> -							-1, NULL, NULL);
> -
>  	} else
>  #endif
>  	{
> diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
> index fd79c9a..17920d8 100644
> --- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h
> +++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
> @@ -6,10 +6,7 @@ extern void nf_defrag_ipv6_enable(void);
>  extern int nf_ct_frag6_init(void);
>  extern void nf_ct_frag6_cleanup(void);
>  extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user);
> -extern void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
> -			       struct net_device *in,
> -			       struct net_device *out,
> -			       int (*okfn)(struct sk_buff *));
> +extern void nf_ct_frag6_consume_orig(struct sk_buff *skb);
> 
>  struct inet_frags_ctl;
> 
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 32443eb..21a89b0 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -576,9 +576,6 @@ static void skb_release_head_state(struct sk_buff *skb)
>  #if IS_ENABLED(CONFIG_NF_CONNTRACK)
>  	nf_conntrack_put(skb->nfct);
>  #endif
> -#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
> -	nf_conntrack_put_reasm(skb->nfct_reasm);
> -#endif
>  #ifdef CONFIG_BRIDGE_NETFILTER
>  	nf_bridge_put(skb->nf_bridge);
>  #endif
> diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
> index 137e245..db57221 100644
> --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
> +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
> @@ -248,7 +248,7 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum,
>  		net_notice_ratelimited("ipv6_conntrack_local: packet too short\n");
>  		return NF_ACCEPT;
>  	}
> -	return __ipv6_conntrack_in(dev_net(out), hooknum, skb, in, out, okfn);
> +	return nf_conntrack_in(dev_net(out), PF_INET6, hooknum, skb);
>  }
> 
>  static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
> index d05b6ee..7e9cc1b 100644
> --- a/net/ipv6/netfilter/nf_conntrack_reasm.c
> +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
> @@ -604,31 +604,16 @@ ret_orig:
>  	return skb;
>  }
> 
> -void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
> -			struct net_device *in, struct net_device *out,
> -			int (*okfn)(struct sk_buff *))
> +void nf_ct_frag6_consume_orig(struct sk_buff *skb)
>  {
>  	struct sk_buff *s, *s2;
> -	unsigned int ret = 0;
> 
>  	for (s = NFCT_FRAG6_CB(skb)->orig; s;) {
> -		nf_conntrack_put_reasm(s->nfct_reasm);
> -		nf_conntrack_get_reasm(skb);
> -		s->nfct_reasm = skb;
> -
>  		s2 = s->next;
>  		s->next = NULL;
> -
> -		if (ret != -ECANCELED)
> -			ret = NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, s,
> -					     in, out, okfn,
> -					     NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
> -		else
> -			kfree_skb(s);
> -
> +		consume_skb(s);
>  		s = s2;
>  	}
> -	nf_conntrack_put_reasm(skb);
>  }
> 
>  static int nf_ct_net_init(struct net *net)
> diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
> index aacd121..581dd9e 100644
> --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
> +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
> @@ -75,8 +75,11 @@ static unsigned int ipv6_defrag(unsigned int hooknum,
>  	if (reasm == skb)
>  		return NF_ACCEPT;
> 
> -	nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
> -			   (struct net_device *)out, okfn);
> +	nf_ct_frag6_consume_orig(reasm);
> +
> +	NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, reasm,
> +		       (struct net_device *) in, (struct net_device *) out,
> +		       okfn, NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
> 
>  	return NF_STOLEN;
>  }
> diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
> index bbbb9a1..c0e060f 100644
> --- a/net/netfilter/ipvs/ip_vs_core.c
> +++ b/net/netfilter/ipvs/ip_vs_core.c
> @@ -1132,12 +1132,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
>  	ip_vs_fill_iph_skb(af, skb, &iph);
>  #ifdef CONFIG_IP_VS_IPV6
>  	if (af == AF_INET6) {
> -		if (!iph.fragoffs && skb_nfct_reasm(skb)) {
> -			struct sk_buff *reasm = skb_nfct_reasm(skb);
> -			/* Save fw mark for coming frags */
> -			reasm->ipvs_property = 1;
> -			reasm->mark = skb->mark;
> -		}
>  		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
>  			int related;
>  			int verdict = ip_vs_out_icmp_v6(skb, &related,
> @@ -1622,12 +1616,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
> 
>  #ifdef CONFIG_IP_VS_IPV6
>  	if (af == AF_INET6) {
> -		if (!iph.fragoffs && skb_nfct_reasm(skb)) {
> -			struct sk_buff *reasm = skb_nfct_reasm(skb);
> -			/* Save fw mark for coming frags. */
> -			reasm->ipvs_property = 1;
> -			reasm->mark = skb->mark;
> -		}
>  		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
>  			int related;
>  			int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum,
> @@ -1679,9 +1667,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
>  		/* sorry, all this trouble for a no-hit :) */
>  		IP_VS_DBG_PKT(12, af, pp, skb, 0,
>  			      "ip_vs_in: packet continues traversal as normal");
> -		if (iph.fragoffs && !skb_nfct_reasm(skb)) {
> +		if (iph.fragoffs) {
>  			/* Fragment that couldn't be mapped to a conn entry
> -			 * and don't have any pointer to a reasm skb
>  			 * is missing module nf_defrag_ipv6
>  			 */
>  			IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
> @@ -1770,38 +1757,6 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
>  #ifdef CONFIG_IP_VS_IPV6
> 
>  /*
> - * AF_INET6 fragment handling
> - * Copy info from first fragment, to the rest of them.
> - */
> -static unsigned int
> -ip_vs_preroute_frag6(unsigned int hooknum, struct sk_buff *skb,
> -		     const struct net_device *in,
> -		     const struct net_device *out,
> -		     int (*okfn)(struct sk_buff *))
> -{
> -	struct sk_buff *reasm = skb_nfct_reasm(skb);
> -	struct net *net;
> -
> -	/* Skip if not a "replay" from nf_ct_frag6_output or first fragment.
> -	 * ipvs_property is set when checking first fragment
> -	 * in ip_vs_in() and ip_vs_out().
> -	 */
> -	if (reasm)
> -		IP_VS_DBG(2, "Fragment recv prop:%d\n", reasm->ipvs_property);
> -	if (!reasm || !reasm->ipvs_property)
> -		return NF_ACCEPT;
> -
> -	net = skb_net(skb);
> -	if (!net_ipvs(net)->enable)
> -		return NF_ACCEPT;
> -
> -	/* Copy stored fw mark, saved in ip_vs_{in,out} */
> -	skb->mark = reasm->mark;
> -
> -	return NF_ACCEPT;
> -}
> -
> -/*
>   *	AF_INET6 handler in NF_INET_LOCAL_IN chain
>   *	Schedule and forward packets from remote clients
>   */
> @@ -1944,14 +1899,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
>  		.priority	= 100,
>  	},
>  #ifdef CONFIG_IP_VS_IPV6
> -	/* After mangle & nat fetch 2:nd fragment and following */
> -	{
> -		.hook		= ip_vs_preroute_frag6,
> -		.owner		= THIS_MODULE,
> -		.pf		= NFPROTO_IPV6,
> -		.hooknum	= NF_INET_PRE_ROUTING,
> -		.priority	= NF_IP6_PRI_NAT_DST + 1,
> -	},
>  	/* After packet filtering, change source only for VS/NAT */
>  	{
>  		.hook		= ip_vs_reply6,
> diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
> index e5920fb..f4cc87f 100644
> --- a/net/netfilter/ipvs/ip_vs_pe_sip.c
> +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
> @@ -64,7 +64,6 @@ static int get_callid(const char *dptr, unsigned int dataoff,
>  static int
>  ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
>  {
> -	struct sk_buff *reasm = skb_nfct_reasm(skb);
>  	struct ip_vs_iphdr iph;
>  	unsigned int dataoff, datalen, matchoff, matchlen;
>  	const char *dptr;
> @@ -78,15 +77,10 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
>  	/* todo: IPv6 fragments:
>  	 *       I think this only should be done for the first fragment. /HS
>  	 */
> -	if (reasm) {
> -		skb = reasm;
> -		dataoff = iph.thoff_reasm + sizeof(struct udphdr);
> -	} else
> -		dataoff = iph.len + sizeof(struct udphdr);
> +	dataoff = iph.len + sizeof(struct udphdr);
> 
>  	if (dataoff >= skb->len)
>  		return -EINVAL;
> -	/* todo: Check if this will mess-up the reasm skb !!! /HS */
>  	retc = skb_linearize(skb);
>  	if (retc < 0)
>  		return retc;
> --
> 1.8.3.2
> 






More information about the kernel-team mailing list