数据包的接收
作者:kendo
Kernel:2.6.12
一、从网卡说起
这并非是一个网卡驱动分析的专门文档,只是对网卡处理数据包的流程进行一个重点的分析。这里以Intel的e100驱动为例进行分析。
大多数网卡都是一个PCI设备,PCI设备都包含了一组标准的配置寄存器,寄存器中包含了PCI设备的厂商ID、设备ID等等信息,驱动
程序使用struct pci_device_id来描述这些寄存器中的标识符。如下:
1 struct pci_device_id { 2 __u32 vendor, device; 3 __u32 subvendor, subdevice; 4 __u32 class, class_mask; 5 kernel_ulong_t driver_data; 6 };
这样,在驱动程序中,常常就可以看到定义一个struct pci_device_id 类型的数组,告诉内核支持不同类型的
PCI设备的列表,以e100驱动为例:
1 #define INTEL_8255X_ETHERNET_DEVICE(device_id, ich) { 2 PCI_VENDOR_ID_INTEL, device_id, PCI_ANY_ID, PCI_ANY_ID, 3 PCI_CLASS_NETWORK_ETHERNET << 8, 0xFFFF00, ich } 4 5 static struct pci_device_id e100_id_table[] = { 6 INTEL_8255X_ETHERNET_DEVICE(0x1029, 0), 7 INTEL_8255X_ETHERNET_DEVICE(0x1030, 0), 8 INTEL_8255X_ETHERNET_DEVICE(0x1031, 3), 9 …… 10 { 0, } 11 };
在内核中,一个PCI设备驱动,使用struct pci_driver结构来描述:
1 struct pci_driver { 2 struct list_head node; 3 char *name; 4 struct module *owner; 5 const struct pci_device_id *id_table; 6 int (*probe) (struct pci_dev *dev, const struct pci_device_id *id); 7 void (*remove) (struct pci_dev *dev); 8 int (*suspend) (struct pci_dev *dev, pm_message_t state); 9 int (*resume) (struct pci_dev *dev); 10 int (*enable_wake) (struct pci_dev *dev, pci_power_t state, int enable); 11 void (*shutdown) (struct pci_dev *dev); 12 13 struct device_driver driver; 14 struct pci_dynids dynids; 15 };
因为在系统引导的时候,PCI设备已经被识别,当内核发现一个已经检测到的设备同驱动注册的id_table中的信息相匹配时,
它就会触发驱动的probe函数,以e100为例:
1 static struct pci_driver e100_driver = { 2 .name = DRV_NAME, 3 .id_table = e100_id_table, 4 .probe = e100_probe, 5 .remove = __devexit_p(e100_remove), 6 #ifdef CONFIG_PM 7 .suspend = e100_suspend, 8 .resume = e100_resume, 9 #endif 10 11 .driver = { 12 .shutdown = e100_shutdown, 13 } 14 15 };
这样,如果系统检测到有与id_table中对应的设备时,就调用驱动的probe函数。
驱动程序在init函数中,调用pci_module_init函数向内核注册PCI驱动e100_driver:
static int __init e100_init_module(void) { if(((1 << debug) - 1) & NETIF_MSG_DRV) { printk(KERN_INFO PFX "%s, %s\n", DRV_DESCRIPTION, DRV_VERSION); printk(KERN_INFO PFX "%s\n", DRV_COPYRIGHT); } return pci_module_init(&e100_driver); }
一切顺利的话,注册的e100_probe函数将被内核调用,这个函数完成两个重要的工作:
1、分配/初始化/注册网络设备;
2、完成PCI设备的I/O区域的分配和映射,以及完成硬件的其它初始化工作;
网络设备使用struct net_device结构来描述,这个结构非常之大,许多重要的参考书籍对它都有较为深入的描述,可以参考《Linux设备驱动程序》中网卡驱动设计的相关章节。我会在后面的内容中,对其重要的成员进行注释;
当probe函数被调用,证明已经发现了我们所支持的网卡,这样,就可以调用register_netdev函数向内核注册网络设备了,注册之前,一般会调用alloc_etherdev为以太网设备分配一个net_device,然后初始化它的重要成员。
除了向内核注册网络设备之外,探测函数另一项重要的工作就是需要对硬件进行初始化,比如,要访问其I/O区域,需要为I/O区域分配内存区域,然后进行映射,这一步一般的流程是:
1、request_mem_region()
2、ioremap()
对于一般的PCI设备而言,可以调用:
1、pci_request_regions()
2、ioremap()
pci_request_regions函数对PCI设备的6个基地址寄存器(BAR)所描述的区域都会调用资源分配函数进行申请(需要判断是I/O端口还是I/O内存),例如:
1 int pci_request_regions(struct pci_dev *pdev, char *res_name) 2 { 3 int i; 4 5 for (i = 0; i < 6; i++) 6 if(pci_request_region(pdev, i, res_name)) 7 goto err_out; 8 return 0;
1 int pci_request_region(struct pci_dev *pdev, int bar, char *res_name) 2 { 3 if (pci_resource_len(pdev, bar) == 0) 4 return 0; 5 6 if (pci_resource_flags(pdev, bar) & IORESOURCE_IO) { 7 if (!request_region(pci_resource_start(pdev, bar), 8 pci_resource_len(pdev, bar), res_name)) 9 goto err_out; 10 } 11 else if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) { 12 if (!request_mem_region(pci_resource_start(pdev, bar), 13 pci_resource_len(pdev, bar), res_name)) 14 goto err_out; 15 } 16 17 return 0;
有了这些基础,我们来看设备的探测函数
1 static int __devinit e100_probe(struct pci_dev *pdev, 2 const struct pci_device_id *ent) 3 { 4 struct net_device *netdev; 5 struct nic *nic; 6 int err; 7 8 9 if(!(netdev = alloc_etherdev(sizeof(struct nic)))) { 10 if(((1 << debug) - 1) & NETIF_MSG_PROBE) 11 printk(KERN_ERR PFX "Etherdev alloc failed, abort.\n"); 12 return -ENOMEM; 13 } 14 15 16 netdev->open = e100_open; 17 netdev->stop = e100_close; 18 netdev->hard_start_xmit = e100_xmit_frame; 19 netdev->get_stats = e100_get_stats; 20 netdev->set_multicast_list = e100_set_multicast_list; 21 netdev->set_mac_address = e100_set_mac_address; 22 netdev->change_mtu = e100_change_mtu; 23 netdev->do_ioctl = e100_do_ioctl; 24 SET_ETHTOOL_OPS(netdev, &e100_ethtool_ops); 25 netdev->tx_timeout = e100_tx_timeout; 26 netdev->watchdog_timeo = E100_WATCHDOG_PERIOD; 27 netdev->poll = e100_poll; 28 netdev->weight = E100_NAPI_WEIGHT; 29 #ifdef CONFIG_NET_POLL_CONTROLLER 30 netdev->poll_controller = e100_netpoll; 31 #endif 32 33 strcpy(netdev->name, pci_name(pdev)); 34 35 36 nic = netdev_priv(netdev); 37 38 nic->netdev = netdev; 39 40 nic->pdev = pdev; 41 nic->msg_enable = (1 << debug) - 1; 42 43 44 pci_set_drvdata(pdev, netdev); 45 46 47 if((err = pci_enable_device(pdev))) { 48 DPRINTK(PROBE, ERR, "Cannot enable PCI device, aborting.\n"); 49 goto err_out_free_dev; 50 } 51 52 53 if(!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { 54 DPRINTK(PROBE, ERR, "Cannot find proper PCI device " 55 "base address, aborting.\n"); 56 err = -ENODEV; 57 goto err_out_disable_pdev; 58 } 59 60 61 if((err = pci_request_regions(pdev, DRV_NAME))) { 62 DPRINTK(PROBE, ERR, "Cannot obtain PCI resources, aborting.\n"); 63 goto err_out_disable_pdev; 64 } 65 66 67 if((err = pci_set_dma_mask(pdev, 0xFFFFFFFFULL))) { 68 DPRINTK(PROBE, ERR, "No usable DMA configuration, aborting.\n"); 69 goto err_out_free_res; 70 } 71 72 SET_MODULE_OWNER(netdev); 73 SET_NETDEV_DEV(netdev, &pdev->dev); 74 75 76 nic->csr = ioremap(pci_resource_start(pdev, 0), sizeof(struct csr)); 
77 if(!nic->csr) { 78 DPRINTK(PROBE, ERR, "Cannot map device registers, aborting.\n"); 79 err = -ENOMEM; 80 goto err_out_free_res; 81 } 82 83 if(ent->driver_data) 84 nic->flags |= ich; 85 else 86 nic->flags &= ~ich; 87 88 89 e100_get_defaults(nic); 90 91 92 spin_lock_init(&nic->cb_lock); 93 spin_lock_init(&nic->cmd_lock); 94 95 96 e100_hw_reset(nic); 97 98 99 pci_set_master(pdev); 100 101 102 init_timer(&nic->watchdog); 103 nic->watchdog.function = e100_watchdog; 104 nic->watchdog.data = (unsigned long)nic; 105 init_timer(&nic->blink_timer); 106 nic->blink_timer.function = e100_blink_led; 107 nic->blink_timer.data = (unsigned long)nic; 108 109 INIT_WORK(&nic->tx_timeout_task, 110 (void (*)(void *))e100_tx_timeout_task, netdev); 111 112 if((err = e100_alloc(nic))) { 113 DPRINTK(PROBE, ERR, "Cannot alloc driver memory, aborting.\n"); 114 goto err_out_iounmap; 115 } 116 117 118 e100_phy_init(nic); 119 120 if((err = e100_eeprom_load(nic))) 121 goto err_out_free; 122 123 memcpy(netdev->dev_addr, nic->eeprom, ETH_ALEN); 124 if(!is_valid_ether_addr(netdev->dev_addr)) { 125 DPRINTK(PROBE, ERR, "Invalid MAC address from " 126 "EEPROM, aborting.\n"); 127 err = -EAGAIN; 128 goto err_out_free; 129 } 130 131 132 if((nic->mac >= mac_82558_D101_A4) && 133 (nic->eeprom[eeprom_id] & eeprom_id_wol)) 134 nic->flags |= wol_magic; 135 136 137 pci_enable_wake(pdev, 0, 0); 138 139 140 strcpy(netdev->name, "eth%d"); 141 if((err = register_netdev(netdev))) { 142 DPRINTK(PROBE, ERR, "Cannot register net device, aborting.\n"); 143 goto err_out_free; 144 } 145 146 DPRINTK(PROBE, INFO, "addr 0x%lx, irq %d, " 147 "MAC addr %02X:%02X:%02X:%02X:%02X:%02X\n", 148 pci_resource_start(pdev, 0), pdev->irq, 149 netdev->dev_addr[0], netdev->dev_addr[1], netdev->dev_addr[2], 150 netdev->dev_addr[3], netdev->dev_addr[4], netdev->dev_addr[5]); 151 152 return 0; 153 154 err_out_free: 155 e100_free(nic); 156 err_out_iounmap: 157 iounmap(nic->csr); 158 err_out_free_res: 159 pci_release_regions(pdev); 160 
err_out_disable_pdev: 161 pci_disable_device(pdev); 162 err_out_free_dev: 163 pci_set_drvdata(pdev, NULL); 164 free_netdev(netdev); 165 return err; 166 }
执行到这里,探测函数的使命就完成了,在对网络设备重要成员初始化时,有:
netdev->open = e100_open;
指定了设备的open函数为e100_open,这样,当第一次使用设备,比如使用ifconfig工具的时候,open函数将被调用。
二、打开设备
在探测函数中,设置了netdev->open = e100_open;指定了设备的open函数为e100_open:
1 static int e100_open(struct net_device *netdev) 2 { 3 struct nic *nic = netdev_priv(netdev); 4 int err = 0; 5 6 netif_carrier_off(netdev); 7 if((err = e100_up(nic))) 8 DPRINTK(IFUP, ERR, "Cannot open interface, aborting.\n"); 9 return err; 10 }
大多数涉及物理连接的网络设备都可以感知信号载波(carrier)的存在,载波的存在意味着设备可以工作。
举个例子来讲:当一个用户拔掉了网线,也就意味着信号载波的消失。
netif_carrier_off:关闭载波信号;
netif_carrier_on:打开载波信号;
netif_carrier_ok:检测载波信号;
对于探测网卡网线是否连接,这一组函数被使用得较多;
接着,调用e100_up函数启动网卡,这个“启动”的过程,最重要的步骤有:
1、调用request_irq向内核注册中断;
2、调用netif_wake_queue函数来重新启动传输队列;
1 static int e100_up(struct nic *nic) 2 { 3 int err; 4 5 if((err = e100_rx_alloc_list(nic))) 6 return err; 7 if((err = e100_alloc_cbs(nic))) 8 goto err_rx_clean_list; 9 if((err = e100_hw_init(nic))) 10 goto err_clean_cbs; 11 e100_set_multicast_list(nic->netdev); 12 e100_start_receiver(nic, 0); 13 mod_timer(&nic->watchdog, jiffies); 14 if((err = request_irq(nic->pdev->irq, e100_intr, SA_SHIRQ, 15 nic->netdev->name, nic->netdev))) 16 goto err_no_irq; 17 netif_wake_queue(nic->netdev); 18 netif_poll_enable(nic->netdev); 19 20 e100_enable_irq(nic); 21 return 0; 22 23 err_no_irq: 24 del_timer_sync(&nic->watchdog); 25 err_clean_cbs: 26 e100_clean_cbs(nic); 27 err_rx_clean_list: 28 e100_rx_clean_list(nic); 29 return err; 30 31 32 }
这样,当网卡产生中断时,注册的中断函数e100_intr将被调用。
三、网卡中断
从本质上来讲,中断,是一种电信号,当设备有某种事件发生的时候,它就会产生中断,通过总线把电信号发送给中断控制器,如果中断的线是激活的,中断控制器就把电信号发送给处理器的某个特定引脚。处理器于是立即停止自己正在做的事,跳到内存中内核设置的中断处理程序的入口点,进行中断处理。
在内核中断处理中,会检测到该中断与我们刚才注册的中断号匹配,于是,注册的中断处理函数就被调用了。
当需要发/收数据,出现错误,连接状态变化等,网卡的中断信号会被触发。当接收到中断后,中断函数读取中断状态位,进行合法性判断,如判断中断信号是否是自己的等,然后,应答设备中断——OK,我已经知道了,你回去继续工作吧……
接着,它就屏蔽此中断,然后调用netif_rx_schedule函数调度接收,内核会在未来某一时刻调用设备的poll函数(对这里而言,注册的是e100_poll)实现设备的轮询:
1 static irqreturn_t e100_intr(int irq, void *dev_id, struct pt_regs *regs) 2 { 3 struct net_device *netdev = dev_id; 4 struct nic *nic = netdev_priv(netdev); 5 u8 stat_ack = readb(&nic->csr->scb.stat_ack); 6 7 DPRINTK(INTR, DEBUG, "stat_ack = 0x%02X\n", stat_ack); 8 9 if(stat_ack == stat_ack_not_ours || 10 stat_ack == stat_ack_not_present) 11 return IRQ_NONE; 12 13 14 writeb(stat_ack, &nic->csr->scb.stat_ack); 15 16 17 if(stat_ack & stat_ack_rnr) 18 nic->ru_running = RU_SUSPENDED; 19 20 e100_disable_irq(nic); 21 netif_rx_schedule(netdev); 22 23 return IRQ_HANDLED; 24 }
对于数据包的接收而言,我们关注的是poll函数中,调用e100_rx_clean进行数据的接收
1 static int e100_poll(struct net_device *netdev, int *budget) 2 { 3 struct nic *nic = netdev_priv(netdev); 4 5 unsigned int work_to_do = min(netdev->quota, *budget); 6 unsigned int work_done = 0; 7 int tx_cleaned; 8 9 10 e100_rx_clean(nic, &work_done, work_to_do); 11 tx_cleaned = e100_tx_clean(nic); 12 13 14 15 if((!tx_cleaned && (work_done == 0)) || !netif_running(netdev)) { 16 netif_rx_complete(netdev); 17 e100_enable_irq(nic); 18 return 0; 19 } 20 21 *budget -= work_done; 22 netdev->quota -= work_done; 23 24 return 1; 25 } 26 27 28 static inline void e100_rx_clean(struct nic *nic, unsigned int *work_done, 29 unsigned int work_to_do) 30 { 31 struct rx *rx; 32 int restart_required = 0; 33 struct rx *rx_to_start = NULL; 34 35 36 if(RU_SUSPENDED == nic->ru_running) 37 restart_required = 1; 38 39 40 for(rx = nic->rx_to_clean; rx->skb; rx = nic->rx_to_clean = rx->next) { 41 int err = e100_rx_indicate(nic, rx, work_done, work_to_do); 42 if(-EAGAIN == err) { 43 44 restart_required = 0; 45 break; 46 } else if(-ENODATA == err) 47 break; 48 } 49 50 51 if(restart_required) 52 rx_to_start = nic->rx_to_clean; 53 54 55 for(rx = nic->rx_to_use; !rx->skb; rx = nic->rx_to_use = rx->next) { 56 if(unlikely(e100_rx_alloc_skb(nic, rx))) 57 break; 58 } 59 60 if(restart_required) { 61 // ack the rnr? 62 writeb(stat_ack_rnr, &nic->csr->scb.stat_ack); 63 e100_start_receiver(nic, rx_to_start); 64 if(work_done) 65 (*work_done)++; 66 } 67 }
原文:http://www.cnblogs.com/listenerln/p/6393009.html