当要发送的IP数据报的长度超出了最大传输单位MTU,且允许分片时,就会进行IP分片。通常,使用UDP协议发送的数据报很容易导致IP分片,而TCP协议是基于流的传输,通常不会产生分片。
IP数据报被分片以后,各分片(fragment)分别组成一个具有IP首部的分组,并各自独立地选择路由,在其分别抵达目的主机后,目的主机的IP层会在传送给传输层之前将接收到的所有分片重装成一个IP数据报。可以怎么理解,IP数据报是IP层端到端的传输单元(在分片之前和重组之后),分组是指在IP层和链路层之间传送的数据单元。一个分组可以是一个完整的IP数据报也可以是IP数据报的一个分片。而分片对传输层是透明的。
IP数据报输出时的分片以及输入IP数据报分片的组装涉及以下文件:
net/ipv4/ip_output.c IP数据报的输出
net/ipv4/ip_fragment.c IP数据报分片的组装
系统参数
ipfrag_high_thresh,可用于组装IP数据报的内存上限值,在进行组装IP分片时,如果占用的缓存超过了此参数,会调用ip_evictor()进行垃圾收集
ipfrag_low_thresh,可用于组装IP数据报的内存下限值,在调用ip_evictor()进行垃圾收集时,一旦组装IP分片所占用的缓存下降到此参数,就会中止垃圾收集
ipfrag_time,待组装IP分片在内存中允许保留的时间,默认值为30s
ipfrag_secret_interval,定时重组ipq散列表的时间间隔,默认值为600s
ip_finish_output()将数据报发送出去之前,该数据报包括本地发送的数据报也包括转发的数据报,会对skb缓存区的数据长度进行检测,如果超出了输出设备的MTU值,且又符合其他一些条件,则调用ip_fragment()对数据报进行分片;否则直接调用ip_finish_output2()输出到数据链路层
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm != NULL) {
IPCB(skb)->flags |= IPSKB_REROUTED;
return dst_output(skb);
}
#endif
if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
return ip_fragment(skb, ip_finish_output2);
else
return ip_finish_output2(skb);
}
目前有两种分片的处理方式:快速分片和慢速分片。整个分片过程需要完成的工作,除了将三层的有效负载根据MTU分成一个一个片段之外,还需要为每个分片设置IP首部,更新校验和等,在快速分片中,将数据分割成片段已经由传输层完成,三层只需将这些片段组成IP分片;而慢速分片则需要完成全部的工作,即对一个完整的IP数据报的MTU值循环进行分片,直至完成。
/*
* This IP datagram is too large to be sent in one piece. Break it up into
* smaller pieces (each of size equal to IP header plus
* a block of the data of the original IP data part) that will yet fit in a
* single device frame, and queue such a frame for sending.
*/
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
struct iphdr *iph;
int raw = 0;
int ptr;
struct net_device *dev;
struct sk_buff *skb2;
unsigned int mtu, hlen, left, len, ll_rs, pad;
int offset;
__be16 not_last_frag;
struct rtable *rt = skb_rtable(skb);
int err = 0;
dev = rt->u.dst.dev;
/*
* Point into the IP datagram header.
*/
iph = ip_hdr(skb);
if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(ip_skb_dst_mtu(skb)));
kfree_skb(skb);
return -EMSGSIZE;
}
/*
* Setup starting values.
*/
hlen = iph->ihl * 4;
mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
/* When frag_list is given, use it. First, check its validity:
* some transformers could create wrong frag_list or break existing
* one, it is not prohibited. In this case fall back to copying.
*
* LATER: this step can be merged to real generation of fragments,
* we can switch to copy when see the first bad fragment.
*/
if (skb_has_frags(skb)) {
struct sk_buff *frag, *frag2;
int first_len = skb_pagelen(skb);
if (first_len - hlen > mtu ||
((first_len - hlen) & 7) ||
(iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
skb_cloned(skb))
goto slow_path;
skb_walk_frags(skb, frag) {
/* Correct geometry. */
if (frag->len > mtu ||
((frag->len & 7) && frag->next) ||
skb_headroom(frag) < hlen)
goto slow_path_clean;
/* Partially cloned skb? */
if (skb_shared(frag))
goto slow_path_clean;
BUG_ON(frag->sk);
if (skb->sk) {
frag->sk = skb->sk;
frag->destructor = sock_wfree;
}
skb->truesize -= frag->truesize;
}
/* Everything is OK. Generate! */
err = 0;
offset = 0;
frag = skb_shinfo(skb)->frag_list;
skb_frag_list_init(skb);
skb->data_len = first_len - skb_headlen(skb);
skb->len = first_len;
iph->tot_len = htons(first_len);
iph->frag_off = htons(IP_MF);
ip_send_check(iph);
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
if (frag) {
frag->ip_summed = CHECKSUM_NONE;
skb_reset_transport_header(frag);
__skb_push(frag, hlen);
skb_reset_network_header(frag);
memcpy(skb_network_header(frag), iph, hlen);
iph = ip_hdr(frag);
iph->tot_len = htons(frag->len);
ip_copy_metadata(frag, skb);
if (offset == 0)
ip_options_fragment(frag);
offset += skb->len - hlen;
iph->frag_off = htons(offset>>3);
if (frag->next != NULL)
iph->frag_off |= htons(IP_MF);
/* Ready, complete checksum */
ip_send_check(iph);
}
err = output(skb);
if (!err)
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
if (err || !frag)
break;
skb = frag;
frag = skb->next;
skb->next = NULL;
}
if (err == 0) {
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
return 0;
}
while (frag) {
skb = frag->next;
kfree_skb(frag);
frag = skb;
}
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
return err;
slow_path_clean:
skb_walk_frags(skb, frag2) {
if (frag2 == frag)
break;
frag2->sk = NULL;
frag2->destructor = NULL;
skb->truesize += frag2->truesize;
}
}
slow_path:
left = skb->len - hlen; /* Space per frame */
ptr = raw + hlen; /* Where to start from */
/* for bridged IP traffic encapsulated inside f.e. a vlan header,
* we need to make room for the encapsulating header
*/
pad = nf_bridge_pad(skb);
ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
mtu -= pad;
/*
* Fragment the datagram.
*/
offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
not_last_frag = iph->frag_off & htons(IP_MF);
/*
* Keep copying data until we run out.
*/
while (left > 0) {
len = left;
/* IF: it doesn't fit, use 'mtu' - the data space left */
if (len > mtu)
len = mtu;
/* IF: we are not sending upto and including the packet end
then align the next start on an eight byte boundary */
if (len < left) {
len &= ~7;
}
/*
* Allocate buffer.
*/
if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
err = -ENOMEM;
goto fail;
}
/*
* Set up data on packet
*/
ip_copy_metadata(skb2, skb);
skb_reserve(skb2, ll_rs);
skb_put(skb2, len + hlen);
skb_reset_network_header(skb2);
skb2->transport_header = skb2->network_header + hlen;
/*
* Charge the memory for the fragment to any owner
* it might possess
*/
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
/*
* Copy the packet header into the new buffer.
*/
skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
/*
* Copy a block of the IP datagram.
*/
if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
BUG();
left -= len;
/*
* Fill in the new header fields.
*/
iph = ip_hdr(skb2);
iph->frag_off = htons((offset >> 3));
/* ANK: dirty, but effective trick. Upgrade options only if
* the segment to be fragmented was THE FIRST (otherwise,
* options are already fixed) and make it ONCE
* on the initial skb, so that all the following fragments
* will inherit fixed options.
*/
if (offset == 0)
ip_options_fragment(skb);
/*
* Added AC : If we are fragmenting a fragment that's not the
* last fragment then keep MF on each bit
*/
if (left > 0 || not_last_frag)
iph->frag_off |= htons(IP_MF);
ptr += len;
offset += len;
/*
* Put this fragment into the sending queue.
*/
iph->tot_len = htons(len + hlen);
ip_send_check(iph);
err = output(skb2);
if (err)
goto fail;
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
}
kfree_skb(skb);
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
return err;
fail:
kfree_skb(skb);
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
return err;
}
在接收方,一个由发送方发出的原始IP数据报,其所有分片将被重新组合,然后才能提交到上层协议。每一个将被重组的IP数据报都用一个ipq结构实例来表示,因此先来看看ipq这个非常重要的结构。
为了能高效地组装分片,用于保存分片的数据结构必须能做到以下几点:
1、快速定位属于某一个数据报的一组分组
2、在属于某一个数据报的一组分片中快速插入新的分片
3、有效地判断一个数据报的所有分片是否已经全部接收
4、具有组装超时机制,如果在重组完成之前定时器溢出,则删除该数据报的所有内容
struct inet_frag_queue {
struct hlist_node list;
struct netns_frags *net;
struct list_head lru_list; /* lru list member */
spinlock_t lock;
atomic_t refcnt;
struct timer_list timer; /* when will this queue expire? */
struct sk_buff *fragments; /* list of received fragments */
ktime_t stamp;
int len; /* total length of orig datagram */
int meat;
__u8 last_in; /* first/last segment arrived? */
#define INET_FRAG_COMPLETE 4
#define INET_FRAG_FIRST_IN 2
#define INET_FRAG_LAST_IN 1
};
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
struct inet_frag_queue q;
u32 user;
__be32 saddr;
__be32 daddr;
__be16 id;
u8 protocol;
int iif;
unsigned int rid;
struct inet_peer *peer;
};saddr、daddr、id、protocol来自IP首部,用来唯一确定分片来自哪个IP数据报
上送传输层及分片处理调用
ip_local_deliver
ip_defrag
ip_find
inet_frag_find
ip_frag_queue
ip_local_deliver_finish
原文:http://blog.csdn.net/wangpeihuixyz/article/details/39099333