许多文件系统都是通过generic_file_write()函数来实现文件对象的write方法,即write(库函数)->sys_write()->generic_file_write():
/*
 * generic_file_write - write a single user buffer to a file.
 * @file:  file object being written
 * @buf:   user-space source buffer
 * @count: number of bytes requested
 * @ppos:  file position, passed through to the lower-level write
 *
 * Wraps @buf/@count in a one-element iovec and hands it to
 * __generic_file_write_nolock() while holding inode->i_sem.  If that call
 * reports a positive byte count and either the file was opened O_SYNC or
 * the inode is IS_SYNC, sync_page_range() is then called on the range
 * [*ppos - ret, *ppos).
 *
 * Returns the result of __generic_file_write_nolock(), or the negative
 * value returned by sync_page_range() when the sync step fails.
 */
ssize_t generic_file_write(struct file *file, const char __user *buf,
			   size_t count, loff_t *ppos)
{
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct iovec local_iov = {
		.iov_base = (void __user *)buf,
		.iov_len = count,
	};
	ssize_t nwritten;
	ssize_t sync_err;

	/* The actual write runs with the inode semaphore held. */
	down(&inode->i_sem);
	nwritten = __generic_file_write_nolock(file, &local_iov, 1, ppos);
	up(&inode->i_sem);

	/* Nothing written (or an error): nothing to sync. */
	if (nwritten <= 0)
		return nwritten;

	/* Only synchronous files/inodes need the explicit sync below. */
	if (!(file->f_flags & O_SYNC) && !IS_SYNC(inode))
		return nwritten;

	sync_err = sync_page_range(inode, mapping, *ppos - nwritten, nwritten);
	if (sync_err < 0)
		return sync_err;

	return nwritten;
}
/*
 * __generic_file_write_nolock - synchronous front end to the AIO write path.
 * @file:    file object being written
 * @iov:     array of source segments
 * @nr_segs: number of entries in @iov
 * @ppos:    file position, passed through to the AIO helper
 *
 * Sets up an on-stack synchronous kiocb for @file and calls
 * __generic_file_aio_write_nolock().  If that returns -EIOCBQUEUED, the
 * result of wait_on_sync_kiocb() is returned instead.
 *
 * As the name says, this function takes no lock itself.
 */
ssize_t __generic_file_write_nolock(struct file *file, const struct iovec *iov,
				    unsigned long nr_segs, loff_t *ppos)
{
	struct kiocb sync_iocb;
	ssize_t result;

	init_sync_kiocb(&sync_iocb, file);

	result = __generic_file_aio_write_nolock(&sync_iocb, iov, nr_segs, ppos);
	if (result != -EIOCBQUEUED)
		return result;

	/* The request was queued: wait for it and report its result. */
	return wait_on_sync_kiocb(&sync_iocb);
}
/*
 * __generic_file_aio_write_nolock - validate an iovec write and dispatch it.
 * @iocb:    kiocb describing the request; iocb->ki_filp is the target file
 * @iov:     array of user-space source segments
 * @nr_segs: number of entries in @iov
 * @ppos:    file position; read here and passed on to the write helpers
 *
 * Steps, in order:
 *   1. Sum the segment lengths and check each segment with access_ok().
 *      A length (or running total) that is negative when viewed as ssize_t
 *      yields -EINVAL.  A first segment that fails access_ok() yields
 *      -EFAULT; a later failing segment truncates the request to the
 *      segments before it.
 *   2. Run generic_write_checks(), which may adjust the position and count.
 *   3. Call remove_suid() and inode_update_time().
 *   4. For O_DIRECT files try generic_file_direct_write(); anything it did
 *      not complete falls through to generic_file_buffered_write().
 *
 * current->backing_dev_info is set for the duration of the write and
 * cleared on every exit path after step 1.
 *
 * Returns the written byte count if it is non-zero, otherwise the last
 * error code recorded (which may be 0).
 */
ssize_t __generic_file_aio_write_nolock(struct kiocb *iocb,
					const struct iovec *iov,
					unsigned long nr_segs, loff_t *ppos)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	size_t total_len = 0;	/* length as originally requested */
	size_t remaining;	/* length after file limit checks */
	unsigned long idx;
	loff_t pos;
	ssize_t written;
	ssize_t err;

	for (idx = 0; idx < nr_segs; idx++) {
		const struct iovec *segment = iov + idx;
		size_t seg_len = segment->iov_len;

		/*
		 * Reject the request if this segment's length, or the
		 * cumulative length so far, is negative as a signed value.
		 */
		total_len += seg_len;
		if (unlikely((ssize_t)(total_len | seg_len) < 0))
			return -EINVAL;

		if (!access_ok(VERIFY_READ, segment->iov_base, seg_len)) {
			if (idx == 0)
				return -EFAULT;
			/* This segment is no good: keep only the earlier ones. */
			nr_segs = idx;
			total_len -= seg_len;
			break;
		}
	}

	remaining = total_len;
	pos = *ppos;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;
	written = 0;

	err = generic_write_checks(file, &pos, &remaining,
				   S_ISBLK(inode->i_mode));
	if (err || remaining == 0)
		goto out;

	err = remove_suid(file->f_dentry);
	if (err)
		goto out;

	inode_update_time(inode, 1);

	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
	if (unlikely(file->f_flags & O_DIRECT)) {
		written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
						    ppos, remaining, total_len);
		if (written < 0 || written == remaining)
			goto out;
		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		pos += written;
		remaining -= written;
	}

	written = generic_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
					      remaining, written);
out:
	current->backing_dev_info = NULL;
	if (written)
		return written;
	return err;
}
ssize_t generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, loff_t *ppos, size_t count, ssize_t written) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; struct address_space_operations *a_ops = mapping->a_ops; struct inode *inode = mapping->host; long status = 0; struct page *page; struct page *cached_page = NULL; size_t bytes; struct pagevec lru_pvec; const struct iovec *cur_iov = iov; /* current iovec */ size_t iov_base = 0; /* offset in the current iovec */ char __user *buf; pagevec_init(&lru_pvec, 0); buf = iov->iov_base + written; /* handle partial DIO write */ do { unsigned long index; unsigned long offset; size_t copied; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ //获取要写的缓冲页面索引(如何根据页索引在radix树中获取到指定页描述符,ULK-PAGE600) index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; //最后剩一点写入内容的处理 if (bytes > count) bytes = count; /* * Bring in the user page that we will copy from _first_. * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. */ fault_in_pages_readable(buf, bytes); //在radix树里面查找要被写的page,如果不存在则创建一个,见下面分析 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); if (!page) { status = -ENOMEM; break; } //为这个page准备一组buffer_head结构,用于描述组成这个page的数据块,见下面分析 status = a_ops->prepare_write(file, page, offset, offset+bytes); if (unlikely(status)) { loff_t isize = i_size_read(inode); /* * prepare_write() may have instantiated a few blocks * outside i_size. Trim these off again. 
*/ unlock_page(page); page_cache_release(page); if (pos + bytes > isize) vmtruncate(inode, isize); break; } if (likely(nr_segs == 1)) copied = (page, offset, buf, bytes); else copied = filemap_copy_from_user_iovec(page, offset, cur_iov, iov_base, bytes); flush_dcache_page(page); //把基础缓冲区标记为脏,以便随后把他们都写到磁盘。 status = a_ops->commit_write(file, page, offset, offset+bytes); if (likely(copied > 0)) { if (!status) status = copied; if (status >= 0) { written += status; count -= status; pos += status; buf += status; if (unlikely(nr_segs > 1)) filemap_set_next_iovec(&cur_iov, &iov_base, status); } } if (unlikely(copied != bytes)) if (status >= 0) status = -EFAULT; unlock_page(page); mark_page_accessed(page); page_cache_release(page); if (status < 0) break; balance_dirty_pages_ratelimited(mapping); cond_resched(); } while (count); *ppos = pos; if (cached_page) page_cache_release(cached_page); /* * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC */ if (likely(status >= 0)) { if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { if (!a_ops->writepage || !is_sync_kiocb(iocb)) status = generic_osync_inode(inode, mapping, OSYNC_METADATA|OSYNC_DATA); } } /* * If we get here for O_DIRECT writes then we must have fallen through * to buffered writes (block instantiation inside i_size). So we sync * the file data here, to try to honour O_DIRECT expectations. */ if (unlikely(file->f_flags & O_DIRECT) && written) status = filemap_write_and_wait(mapping); pagevec_lru_add(&lru_pvec); return written ? 
written : status; } static inline struct page * __grab_cache_page(struct address_space *mapping, unsigned long index, struct page **cached_page, struct pagevec *lru_pvec) { int err; struct page *page; repeat: //根据address_space地址和缓冲页的索引,获取缓冲页面的描述符(ULK-PAGE602) page = find_lock_page(mapping, index); if (!page) { if (!*cached_page) { *cached_page = page_cache_alloc(mapping); if (!*cached_page) return NULL; } //把一个新页的描述符插入到页高速缓存--在radix树中出入新节点 err = add_to_page_cache(*cached_page, mapping, index, GFP_KERNEL); if (err == -EEXIST) goto repeat; if (err == 0) { page = *cached_page; page_cache_get(page); if (!pagevec_add(lru_pvec, page)) __pagevec_lru_add(lru_pvec); *cached_page = NULL; } } return page; }
原文:http://blog.csdn.net/getnextwindow/article/details/30996385