【块设备驱动程序】
Linux系统中的设备主要有字符设备、块设备和网络设备;在Linux内核中,I/O设备分为两大类:字符设备和块设备。块设备将数据存储在固定大小的块中,每个块都有自己的地址。数据块的大小通常在512字节到4K字节之间。
块设备与文件系统的关系如图所示:
块设备的结构:扇区,磁道,柱面,盘片。其中扇区是硬件设备传送数据的基本单位,其大小一般为512字节,也有更大的512*n字节的。但在Linux内核中,逻辑扇区的大小历来固定为512字节。
内存是一个线性结构,Linux系统将内存分为页,页的大小从4K到64K,当在内存和磁盘间传送数据时,先将页内的数据封装成一个段,内核以段为基本单位来读写磁盘。段用bio_vec表示,多个页被封装成多个段,多个段组成一个以bio_vec为元素的数组bi_io_vec。bi_io_vec是块I/O结构bio结构体中的一个指针,多个bio组成一个request,request将被连接到请求队列request_queue中。最后这个请求队列将被处理,将数据写到磁盘。关系如下图所示:
总结:扇区(512) <= 块 <= 段 <= 页(4096),且 块= n * 扇区, 段 = n * 块。
块设备驱动的架构:
块设备加载过程:
分配磁盘alloc_disk() -----> 注册设备register_blkdev() -----> 使用请求队列blk_init_queue()【不使用请求队列blk_alloc_queue()】 -----> 磁盘gendisk属性的设置 -----> 激活磁盘add_disk()。
块设备卸载过程:
删除gendisk del_gendisk() -----> 删除gendisk的引用 put_disk() -----> 清除请求队列 blk_cleanup_queue() -----> 注销块设备 unregister_blkdev()。
通用块层是块设备驱动的核心部分,主要包括了块设备驱动程序的通用代码部分。
其中块设备加载过程用到的通用块层数据结构有: gendisk 、 request_queue 、 request 、bio 、 block_device_operations等。
在Linux内核中,gendisk表示一个磁盘,也可以表示一个分区。
113 struct gendisk {
114 int major; /* major number of driver */
115 int first_minor;
116 int minors; /* maximum number of minors, =1 for
117 * disks that can't be partitioned. */
118 char disk_name[32]; /* name of major driver */
119 struct hd_struct **part; /* [indexed by minor] */
120 int part_uevent_suppress;
121 struct block_device_operations *fops;
122 struct request_queue *queue;
123 void *private_data;
124 sector_t capacity;
125
126 int flags;
127 struct device *driverfs_dev;
128 struct kobject kobj;
129 struct kobject *holder_dir;
130 struct kobject *slave_dir;
131
132 struct timer_rand_state *random;
133 int policy;
134
135 atomic_t sync_io; /* RAID */
136 unsigned long stamp;
137 int in_flight;
138 #ifdef CONFIG_SMP
139 struct disk_stats *dkstats;
140 #else
141 struct disk_stats dkstats;
142 #endif
143 struct work_struct async_notify;
144 };
gendisk是一个动态的结构体,其成员随系统状态不断变化,所以不能静态分配该结构,内核提供专用函数alloc_disk()来分配该结构体。
697 struct gendisk *alloc_disk(int minors)
698 {
699 return alloc_disk_node(minors, -1);
700 }
701
702 struct gendisk *alloc_disk_node(int minors, int node_id)
703 {
704 struct gendisk *disk;
705
706 disk = kmalloc_node(sizeof(struct gendisk),
707 GFP_KERNEL | __GFP_ZERO, node_id);
708 if (disk) {
709 if (!init_disk_stats(disk)) {
710 kfree(disk);
711 return NULL;
712 }
713 if (minors > 1) {
714 int size = (minors - 1) * sizeof(struct hd_struct *);
715 disk->part = kmalloc_node(size,
716 GFP_KERNEL | __GFP_ZERO, node_id);
717 if (!disk->part) {
718 free_disk_stats(disk);
719 kfree(disk);
720 return NULL;
721 }
722 }
723 disk->minors = minors;
724 kobj_set_kset_s(disk,block_subsys);
725 kobject_init(&disk->kobj);
726 rand_initialize_disk(disk);
727 INIT_WORK(&disk->async_notify,
728 media_change_notify_thread);
729 }
730 return disk;
731 }
使用alloc_disk分配了一个disk后,需要设置gendisk的属性:
strcpy(xxx_disk->disk_name, xxx_DISKNAME); //设定设备名字
xxx_disk->major = xxx_MAJOR; //设置主设备号
xxx_disk->first_minor = 0; //设置起始次设备号
xxx_disk->fops = &xxx_fops; //指定块设备操作函数
xxx_disk->queue = xxx_queue; //设置请求队列
set_capacity(xxx_disk, xxx_BYTES>>9); //设置设备容量(以512字节扇区为单位)
之后,就可以用add_disk函数向系统激活这个磁盘设备了:
178 void add_disk(struct gendisk *disk)
179 {
180 disk->flags |= GENHD_FL_UP;
181 blk_register_region(MKDEV(disk->major, disk->first_minor),
182 disk->minors, NULL, exact_match, exact_lock, disk);
183 register_disk(disk);
184 blk_register_queue(disk);
185 }
当不需要磁盘时,应当使用del_gendisk函数删除gendisk结构体,之后还应使用put_disk函数减少gendisk的引用计数。再之后就是使用unregister_blkdev函数注销块设备。
1118 struct block_device_operations {
1119 int (*open) (struct inode *, struct file *);
1120 int (*release) (struct inode *, struct file *);
1121 int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
1122 long (*unlocked_ioctl) (struct file *, unsigned, unsigned long);
1123 long (*compat_ioctl) (struct file *, unsigned, unsigned long);
1124 int (*direct_access) (struct block_device *, sector_t, unsigned long *);
1125 int (*media_changed) (struct gendisk *);
1126 int (*revalidate_disk) (struct gendisk *);
1127 int (*getgeo)(struct block_device *, struct hd_geometry *);
1128 struct module *owner;
1129 };
数据从内存到磁盘或者从磁盘到内存的过程叫做I/O操作。内核使用bio结构体来描述I/O操作,bio结构体包含一个块设备完成一次I/O操作所需的一切信息。可以将bio理解为描述内存中连续几页的数据,每页中的数据由一个段bio_vec表示,所以几页中的数据就组成一个bi_io_vec的数组。
74 struct bio {
75 sector_t bi_sector; /* device address in 512 byte
76 sectors */
77 struct bio *bi_next; /* request queue link */
78 struct block_device *bi_bdev;
79 unsigned long bi_flags; /* status, command, etc */
80 unsigned long bi_rw; /* bottom bits READ/WRITE,
81 * top bits priority
82 */
83
84 unsigned short bi_vcnt; /* how many bio_vec's */
85 unsigned short bi_idx; /* current index into bvl_vec */
86
87 /* Number of segments in this BIO after
88 * physical address coalescing is performed.
89 */
90 unsigned short bi_phys_segments;
91
92 /* Number of segments after physical and DMA remapping
93 * hardware coalescing is performed.
94 */
95 unsigned short bi_hw_segments;
96
97 unsigned int bi_size; /* residual I/O count */
98
99 /*
100 * To keep track of the max hw size, we account for the
101 * sizes of the first and last virtually mergeable segments
102 * in this bio
103 */
104 unsigned int bi_hw_front_size;
105 unsigned int bi_hw_back_size;
106
107 unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
108
109 struct bio_vec *bi_io_vec; /* the actual vec list */
110
111 bio_end_io_t *bi_end_io;
112 atomic_t bi_cnt; /* pin count */
113
114 void *bi_private;
115
116 bio_destructor_t *bi_destructor; /* destructor */
117 };
bio中段用bio_vec来表示:
56 /*
57 * was unsigned short, but we might as well be ready for > 64kB I/O pages
58 */
59 struct bio_vec {
60 struct page *bv_page;
61 unsigned int bv_len;
62 unsigned int bv_offset;
63 };
几个连续的页组成一个bio结构,几个相邻的bio结构组成一个请求结构request:
218 struct request {
219 struct list_head queuelist;
220 struct list_head donelist;
221
222 struct request_queue *q;
223
224 unsigned int cmd_flags;
225 enum rq_cmd_type_bits cmd_type;
226
227 /* Maintain bio traversal state for part by part I/O submission.
228 * hard_* are block layer internals, no driver should touch them!
229 */
230
231 sector_t sector; /* next sector to submit */
232 sector_t hard_sector; /* next sector to complete */
233 unsigned long nr_sectors; /* no. of sectors left to submit */
234 unsigned long hard_nr_sectors; /* no. of sectors left to complete */
235 /* no. of sectors left to submit in the current segment */
236 unsigned int current_nr_sectors;
237
238 /* no. of sectors left to complete in the current segment */
239 unsigned int hard_cur_sectors;
240
241 struct bio *bio;
242 struct bio *biotail;
243
244 struct hlist_node hash; /* merge hash */
245 /*
246 * The rb_node is only used inside the io scheduler, requests
247 * are pruned when moved to the dispatch queue. So let the
248 * completion_data share space with the rb_node.
249 */
250 union {
251 struct rb_node rb_node; /* sort/lookup */
252 void *completion_data;
253 };
254
255 /*
256 * two pointers are available for the IO schedulers, if they need
257 * more they have to dynamically allocate it.
258 */
259 void *elevator_private;
260 void *elevator_private2;
261
262 struct gendisk *rq_disk;
263 unsigned long start_time;
264
265 /* Number of scatter-gather DMA addr+len pairs after
266 * physical address coalescing is performed.
267 */
268 unsigned short nr_phys_segments;
269
270 /* Number of scatter-gather addr+len pairs after
271 * physical and DMA remapping hardware coalescing is performed.
272 * This is the number of scatter-gather entries the driver
273 * will actually have to deal with after DMA mapping is done.
274 */
275 unsigned short nr_hw_segments;
276
277 unsigned short ioprio;
278
279 void *special;
280 char *buffer;
281
282 int tag;
283 int errors;
284
285 int ref_count;
286
287 /*
288 * when request is used as a packet command carrier
289 */
290 unsigned int cmd_len;
291 unsigned char cmd[BLK_MAX_CDB];
292
293 unsigned int data_len;
294 unsigned int sense_len;
295 void *data;
296 void *sense;
297
298 unsigned int timeout;
299 int retries;
300
301 /*
302 * completion callback.
303 */
304 rq_end_io_fn *end_io;
305 void *end_io_data;
306
307 /* for bidi */
308 struct request *next_rq;
309 };
每一个request中包含了多个bio结构体。每个块设备驱动程序都维护着自己的请求队列request_queue,请求队列主要用来连接多个request请求结构。内核将请求队列设计成一个双向链表,它的每个元素都是一个请求结构request。其中还包括对request结构排序算法的指定。
350 struct request_queue
351 {
352 /*
353 * Together with queue_head for cacheline sharing
354 */
355 struct list_head queue_head;
356 struct request *last_merge;
357 elevator_t *elevator;
358
359 /*
360 * the queue request freelist, one for reads and one for writes
361 */
362 struct request_list rq;
363
364 request_fn_proc *request_fn;
365 make_request_fn *make_request_fn;
366 prep_rq_fn *prep_rq_fn;
367 unplug_fn *unplug_fn;
368 merge_bvec_fn *merge_bvec_fn;
369 prepare_flush_fn *prepare_flush_fn;
370 softirq_done_fn *softirq_done_fn;
371
372 /*
373 * Dispatch queue sorting
374 */
375 sector_t end_sector;
376 struct request *boundary_rq;
377
378 /*
379 * Auto-unplugging state
380 */
381 struct timer_list unplug_timer;
382 int unplug_thresh; /* After this many requests */
383 unsigned long unplug_delay; /* After this many jiffies */
384 struct work_struct unplug_work;
385
386 struct backing_dev_info backing_dev_info;
387
388 /*
389 * The queue owner gets to use this for whatever they like.
390 * ll_rw_blk doesn't touch it.
391 */
392 void *queuedata;
393
394 /*
395 * queue needs bounce pages for pages above this limit
396 */
397 unsigned long bounce_pfn;
398 gfp_t bounce_gfp;
399
400 /*
401 * various queue flags, see QUEUE_* below
402 */
403 unsigned long queue_flags;
404
405 /*
406 * protects queue structures from reentrancy. ->__queue_lock should
407 * _never_ be used directly, it is queue private. always use
408 * ->queue_lock.
409 */
410 spinlock_t __queue_lock;
411 spinlock_t *queue_lock;
412
413 /*
414 * queue kobject
415 */
416 struct kobject kobj;
417
418 /*
419 * queue settings
420 */
421 unsigned long nr_requests; /* Max # of requests */
422 unsigned int nr_congestion_on;
423 unsigned int nr_congestion_off;
424 unsigned int nr_batching;
425
426 unsigned int max_sectors;
427 unsigned int max_hw_sectors;
428 unsigned short max_phys_segments;
429 unsigned short max_hw_segments;
430 unsigned short hardsect_size;
431 unsigned int max_segment_size;
432
433 unsigned long seg_boundary_mask;
434 unsigned int dma_alignment;
435
436 struct blk_queue_tag *queue_tags;
437 struct list_head tag_busy_list;
438
439 unsigned int nr_sorted;
440 unsigned int in_flight;
441
442 /*
443 * sg stuff
444 */
445 unsigned int sg_timeout;
446 unsigned int sg_reserved_size;
447 int node;
448 #ifdef CONFIG_BLK_DEV_IO_TRACE
449 struct blk_trace *blk_trace;
450 #endif
451 /*
452 * reserved for flush operations
453 */
454 unsigned int ordered, next_ordered, ordseq;
455 int orderr, ordcolor;
456 struct request pre_flush_rq, bar_rq, post_flush_rq;
457 struct request *orig_bar_rq;
458
459 struct mutex sysfs_lock;
460
461 #if defined(CONFIG_BLK_DEV_BSG)
462 struct bsg_class_device bsg_dev;
463 #endif
464 };
请求队列、请求结构和bio之间的关系如图所示:
具体块设备驱动程序可参考Linux源码http://lxr.free-electrons.com/source/drivers/block/nbd.c?v=2.6.24
Linux驱动开发学习归纳-4,布布扣,bubuko.com
原文:http://blog.csdn.net/qiaojianqj/article/details/23268905