本文分析基于Linux Kernel 1.2.13
原创作品,转载请标明http://blog.csdn.net/yming0221/article/details/7514017
更多请看专栏,地址http://blog.csdn.net/column/details/linux-kernel-net.html
作者:闫明
注:标题中的“(上)”、“(下)”表示分析过程基于数据包的传递方向:“(上)”表示从底层向上分析,“(下)”表示从上层向下分析。
简单分析了链路层之后,上升到网络层来分析,看看链路层是如何为其上层--网络层服务的。其实在驱动程序层和网络层之间还有一层是接口层,叫做驱动程序接口层,用来整合不同的网络设备。接口层的内容会在分析上下层时提及。这里我们分析网络层IP协议的实现原理。
其实现主要位于net/inet/ip.c文件中。
我们首先分析下ip_init()初始化函数
这个函数是如何被调用的呢?
下面是调用的过程:
首先是在系统启动过程main.c中调用了sock_init()函数
void sock_init(void)//网络栈初始化{int i;printk("Swansea University Computer Society NET3.019\n");/* *Initialize all address (protocol) families. */ for (i = 0; i第三个字段为相应网络协议的处理函数。
第四个字段是一个void指针。
第五个字段是next指针域,用于将该结构连接成链表。
下面是ip_init()函数
/* *IP registers the packet type and then calls the subprotocol initialisers */void ip_init(void)//该函数在af_inet.c文件中{ip_packet_type.type=htons(ETH_P_IP);dev_add_pack(//将网络协议插入IP协议链表,头插法/* So we flush routes when a device is downed */register_netdevice_notifier(//将其插入通知链表/*ip_raw_init();ip_packet_init();ip_tcp_init();ip_udp_init();*/}这里需要说明的是系统采用主动通知的方式,其实现有赖于notifier_block结构,其定义在notifier.h中struct notifier_block{int (*notifier_call)(unsigned long, void *);struct notifier_block *next;int priority;};对于网卡设备而言,网卡设备的启动和关闭是事件,内核需要得到通知从而采取相应的措施。其原理是:当事件发生时,事件通知者遍历某个队列,对队列中感兴趣(符合条件)的被通知者调用其注册时定义的通知处理函数,从而让内核做出相应的操作。当硬件缓冲区数据填满后,会执行中断处理程序,以NE 8390网卡为例,由于在ne.c文件中注册中断时的中断处理函数设置如下:
request_irq (dev->irq, ei_interrupt, 0, wordlength==2 ? "ne2000":"ne1000");中断处理函数为ei_interrupt()。执行ei_interrupt()函数时会调用函数ei_receive(),而ei_receive()函数会调用netif_rx()函数将数据包以sk_buff的形式发送给上层。当然netif_rx()函数的特点前面分析过,即Bottom Half技术,使得中断处理过程有效地缩短,提高系统的效率。在下半段该函数会调用dev_transmit()函数,而它会调用dev_tint()函数,dev_tint()会调用函数dev_queue_xmit(),这个函数会调用dev->hard_start_xmit函数,该函数指针在ethdev_init()函数中赋值了:
dev->hard_start_xmit = //设备的发送函数,定义在8390.c中最后调用ei_start_xmit()函数将数据包从硬件设备中读出放在skb中,即存放到内核空间中。中断返回后系统会执行下半段,即执行net_bh()函数,该函数会扫描网络协议队列,调用相应的协议的接收函数,IP协议就会调用ip_rcv()
/* *When we are called the queue is ready to grab, the interrupts are *on and hardware can interrupt and queue to the receive queue a we *run with no problems. *This is run as a bottom half after an interrupt handler that does *mark_bh(NET_BH); */ void net_bh(void *tmp){................................... while((skb=skb_dequeue(//取出该数据包所属的协议类型/* *We got a packet ID. Now loop over the "known protocols" *table (which is actually a linked list, but this will *change soon if I get my way- FvK), and forward the packet *to anyone who wants it. * *[FvK didn't get his way but he is right this ought to be *hashed so we typically get a single hit. The speed cost *here is minimal but no doubt adds up at the 4,000+ pkts/second *rate we can hit flat out] */pt_prev = NULL;for (ptype = ptype_base; ptype != NULL; ptype = ptype->next) //遍历ptype_base所指向的网络协议队列{ //判断协议号是否匹配if ((ptype->type == type || ptype->type == htons(ETH_P_ALL)) skb2=skb_clone(skb, GFP_ATOMIC);//复制数据包结构/* *Kick the protocol handler. This should be fast *and efficient code. */if(skb2)pt_prev->func(skb2, skb->dev, pt_prev);//调用相应协议的处理函数,//这里和网络协议的种类有关系//如IP 协议的处理函数就是ip_rcv}/* Remember the current last to do */pt_prev=ptype;}} /* End of protocol list loop */...........................................}IP数据包类型的初始化设置在ip.c中/* *IP protocol layer initialiser */static struct packet_type ip_packet_type ={0,/* MUTTER ntohs(ETH_P_IP),*/NULL,/* All devices */ip_rcv,NULL,NULL,};接下来分析ip_rcv()函数,这是IP层的接收函数,接收来自链路层的数据。
这里首先了解一下IP数据包的首部结构,结构示意图如下:
标志位共三位,分别是:保留位、DF(Don't Fragment,置1表示禁止分片)、MF(More Fragments,置1表示后面还有分片)。
内核中对应的结构体定义如下(include/linux/ip.h):
/*IP数据包首部结构体*/struct iphdr {#if defined(LITTLE_ENDIAN_BITFIELD)__u8ihl:4,version:4;#elif defined (BIG_ENDIAN_BITFIELD)__u8version:4, ihl:4;#else#error"Please fix "#endif__u8tos;//服务类型__u16tot_len;//总长度__u16id;//标示__u16frag_off;//标志和片偏移__u8ttl;//生存时间__u8protocol;//协议__u16check;//头部校验和__u32saddr;//源地址__u32daddr;//目的地址/*The options start here. */};内核中用于封装网络数据的最重要的数据结构sk_buff定义在include/linux/skbuff.h中:struct sk_buff { struct sk_buff* volatile next;//指针域,指向后继 struct sk_buff* volatile prev;//指针域,指向前驱#if CONFIG_SKB_CHECK intmagic_debug_COOKIE;#endif struct sk_buff* volatile link3;//该指针域用于TCP协议,指向数据包重发队列 struct sock*sk;//该数据指向的套接字 volatile unsigned longwhen;/* used to compute rtt's用于计算往返时间*/ struct timevalstamp; struct device*dev;//标示发送或接收该数据包的接口设备 struct sk_buff*mem_addr;//指向该sk_buff结构指向的内存基地址,用于该数据结构的内存释放 union {//该联合结构用于实现通用struct tcphdr*th;struct ethhdr*eth;//链路层有效struct iphdr*iph;//网络层有效struct udphdr*uh;unsigned char*raw;unsigned longseq;//TCP协议有效,表示该数据包的ACK值 } h; struct iphdr*ip_hdr;/* For IPPROTO_RAW ,指向IP首部指针,用于RAW套接字*/ unsigned longmem_len;//表示该结构的大小和数据帧的总大小 unsigned long len;//表示数据帧大小 unsigned longfraglen;//表示分片个数 struct sk_buff*fraglist;/* Fragment list ,分片数据包队列*/ unsigned longtruesize;//==mem_len unsigned long saddr;//源地址 unsigned long daddr;//目的地址 unsigned longraddr;//数据包的下一站地址 volatile char acked,//==1表示该数据包已经得到确认used,//==1表示该数据包已经被数据包用完,可以释放free,//==1表示该数据包用完后直接释放,不用缓存arp;//==1表示MAC数据帧首部完成,否则表示MAC首部目的硬件地址尚不知晓,需使用ARP协议询问 unsigned chartries,//表示数据包已得到试发送 lock,//表示是否被其他程序使用 localroute,//表示是局域网路由还是广域网路由 pkt_type;//表示数据包的类型,分为,发往本机、广播、多播、发往其他主机#define PACKET_HOST0/* To us */#define PACKET_BROADCAST1#define PACKET_MULTICAST2#define PACKET_OTHERHOST3/* Unmatched promiscuous */ unsigned shortusers;/* User count - see datagram.c (and soon seqpacket.c/stream.c) 使用该数据包的程序数目*/ unsigned shortpkt_class;/* For drivers that need to cache the packet type with the skbuff (new PPP) */#ifdef CONFIG_SLAVE_BALANCING unsigned shortin_dev_queue;//表示数据包是否在设备缓冲队列#endif unsigned 
longpadding[0];//填充 unsigned chardata[0];//指向数据部分};ip_rcv()函数流程图:
/* *This function receives all incoming IP datagrams. */int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt){struct iphdr *iph = skb->h.iph;struct sock *raw_sk=NULL;unsigned char hash;unsigned char flag = 0;unsigned char opts_p = 0;/* Set iff the packet has options. */struct inet_protocol *ipprot;//每个传输层协议对应一个inet_protocol,用于调用传输层的服务函数static struct options opt; /* since we don't use these yet, and theytake up stack space. */int brd=IS_MYADDR;int is_frag=0;#ifdef CONFIG_IP_FIREWALLint err;#endifip_statistics.IpInReceives++;/* *Tag the ip header of this packet so we can find it */skb->ip_hdr = iph;//设置IP首部指针/* *Is the datagram acceptable? * *1.Length at least the size of an ip header *2.Version of 4 *3.Checksums correctly. [Speed optimisation for later, skip loopback checksums] *(4.We ought to check for IP multicast addresses and undefined types.. does this matter ?) *///IP数据包合法性检查if (skb->lenihlversion != 4 ||skb->lentot_len) || ip_fast_csum((unsigned char *)iph, iph->ihl) !=0){ip_statistics.IpInHdrErrors++;kfree_skb(skb, FREE_WRITE);return(0);}/* *See if the firewall wants to dispose of the packet. */#ifdefCONFIG_IP_FIREWALL//检查防火墙是否阻止该数据包,过滤数据包if ((err=ip_fw_chk(iph,dev,ip_fw_blk_chain,ip_fw_blk_policy, 0))!=1){if(err==-1)icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0, dev);kfree_skb(skb, FREE_WRITE);return 0;}#endif/* *Our transport medium may have padded the buffer out. Now we know it *is IP we can trim to the true length of the frame. */skb->len=ntohs(iph->tot_len);/* *Next analyse the packet for options. Studies show under one packet in *a thousand have options.... */if (iph->ihl != 5)//IP数据报首部存在选项字段{ /* Fast path for the typical optionless IP packet. */memset((char *) if (do_options(iph, opts_p = 1;}/* *Remember if the frame is fragmented. */ //看该数据包是否含有分片,MF和偏移同时为0,则表示无分片,否则是分片,此处有BUG /* 分片的条件 第一个分片MF=1,offset=0 中间分片MF=1,offset!=0 最后分片MF=0,offset!=0 */if(iph->frag_off){if (iph->frag_off /* *Last fragment ? 
*/if (ntohs(iph->frag_off) }/* *Do any IP forwarding required. chk_addr() is expensive -- avoid it someday. * *This is inefficient. While finding out if it is for us we could also compute *the routing table entry. This is where the great unified cache theory comes *in as and when someone implements it * *For most hosts over 99% of packets match the first conditional *and don't go via ip_chk_addr. Note: brd is set to IS_MYADDR at *function entry. */if ( iph->daddr != skb->dev->pa_addr return 0;}/* *The packet is for another target. Forward the frame */#ifdef CONFIG_IP_FORWARDip_forward(skb, dev, is_frag);//转发数据报#else/*printk("Machine %lx tried to use us as a forwarder to %lx but we have forwarding disabled!\n",iph->saddr,iph->daddr);*/ip_statistics.IpInAddrErrors++;#endif/* *The forwarder is inefficient and copies the packet. We *free the original now. */kfree_skb(skb, FREE_WRITE);return(0);}#ifdef CONFIG_IP_MULTICAST//多播if(brd==IS_MULTICAST do{if(ip_mc==NULL){kfree_skb(skb, FREE_WRITE);return 0;}if(ip_mc->multiaddr==iph->daddr)break;ip_mc=ip_mc->next;}while(1);}#endif/* *Account for the packet */ #ifdef CONFIG_IP_ACCTip_acct_cnt(iph,dev, ip_acct_chain);#endif/* * Reassemble IP fragments. */if(is_frag)//该数据报是一个分片,进行合并{/* Defragment. Obtain the complete packet if there is one *//*该函数的作用是梳理分片的数据报,如果接收当前分片后,所有分片均已到达*该函数会调用ip_glue()函数进行IP数据报的重组,否则将该IP数据报放到ipq中fragment*字段指向的队列中**/skb=ip_defrag(iph,skb,dev);if(skb==NULL)return 0;skb->dev = dev;iph=skb->h.iph;} /* *Point into the IP datagram, just past the header. */skb->ip_hdr = iph;skb->h.raw += iph->ihl*4;//sk_buff中union类型的h字段永远指向当前正在处理的协议的首部,这里使其指向传输层的首部,用于传输层的处理/* *Deliver to raw sockets. This is fun as to avoid copies we want to make no surplus copies. 
*/ hash = iph->protocol /* If there maybe a raw socket we must check - if not we don't care less *///处理RAW类型的套接字if((raw_sk=raw_prot.sock_array[hash])!=NULL){struct sock *sknext=NULL;struct sk_buff *skb1;raw_sk=get_sock_raw(raw_sk, hash, iph->saddr, iph->daddr);if(raw_sk)/* Any raw sockets */{do{/* Find the next */sknext=get_sock_raw(raw_sk->next, hash, iph->saddr, iph->daddr);if(sknext)skb1=skb_clone(skb, GFP_ATOMIC);elsebreak;/* One pending raw socket left */if(skb1)raw_rcv(raw_sk, skb1, dev, iph->saddr,iph->daddr);//RAW类型套接字的接收函数raw_sk=sknext;}while(raw_sk!=NULL);/* Here either raw_sk is the last raw socket, or NULL if none *//* We deliver to the last raw socket AFTER the protocol checks as it avoids a surplus copy */}}/* *skb->h.raw now points at the protocol beyond the IP header. */hash = iph->protocol //对所有使用IP协议的上层协议套接字处理for (ipprot = (struct inet_protocol *)inet_protos[hash];ipprot != NULL;ipprot=(struct inet_protocol *)ipprot->next){struct sk_buff *skb2;if (ipprot->protocol != iph->protocol)continue; /** See if we need to make a copy of it. This will* only be set if more than one protocol wants it.* and then not for the last one. If there is a pending*raw delivery wait for that*/if (ipprot->copy || raw_sk){skb2 = skb_clone(skb, GFP_ATOMIC);if(skb2==NULL)continue;}else{skb2 = skb;}flag = 1; /** Pass on the datagram to each protocol that wants it,* based on the datagram protocol. We should really* check the protocol handler's return values here...*/ipprot->handler(skb2, dev, opts_p ? }/* * All protocols checked. * If this packet was a broadcast, we may *not* reply to it, since that * causes (proven, grin) ARP storms and a leakage of memory (i.e. all * ICMP reply messages get queued up for transmission...) 
*/if(raw_sk!=NULL)/* Shift to last raw user */raw_rcv(raw_sk, skb, dev, iph->saddr, iph->daddr);else if (!flag)/* Free and report errors */{if (brd != IS_BROADCAST kfree_skb(skb, FREE_WRITE);}return(0);}这里会进一步调用raw_rcv()或者相应协议的ipprot->handler来调用传输层服务函数。下篇会进行简单分析。