/*
 * sys_select - select() system-call entry point (the article locates it in
 * fs/select.c).
 *
 * When a timeout is supplied, the timeval is copied in from user space and
 * its seconds/microseconds are normalised (whole seconds carried out of
 * tv_usec, the remainder scaled to nanoseconds) before being handed to
 * poll_select_set_timeout().  The real work is done by core_sys_select();
 * its result is then passed through poll_select_copy_remaining().
 *
 * Returns -EFAULT if the timeval cannot be read, -EINVAL if
 * poll_select_set_timeout() rejects it, otherwise the value produced by
 * poll_select_copy_remaining().
 */
asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
			   fd_set __user *exp, struct timeval __user *tvp)
{
	struct timespec end_time;
	struct timespec *deadline = NULL;	/* stays NULL when no timeout was given */
	struct timeval user_tv;
	int rc;

	if (tvp != NULL) {
		/* Bring the caller's timeval into kernel memory. */
		if (copy_from_user(&user_tv, tvp, sizeof(user_tv)) != 0)
			return -EFAULT;

		deadline = &end_time;
		if (poll_select_set_timeout(deadline,
				user_tv.tv_sec + (user_tv.tv_usec / USEC_PER_SEC),
				(user_tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC) != 0)
			return -EINVAL;
	}

	rc = core_sys_select(n, inp, outp, exp, deadline);

	/*
	 * NOTE(review): presumably writes the unused part of the timeout back
	 * to tvp via copy_to_user -- confirm in poll_select_copy_remaining().
	 */
	rc = poll_select_copy_remaining(&end_time, tvp, 1, rc);
	return rc;
}
前面的sys_select主要负责把超时参数从用户空间拷贝到内核空间,具体工作在core_sys_select函数中实现,而真正的核心逻辑位于它所调用的do_select函数里。
//位置:fs/select.c
/*
 * core_sys_select - marshal the select() bitmaps between user and kernel
 * space around a call to do_select().
 *
 * @n:        number of descriptors to examine; clamped to the current
 *            fdtable's max_fds
 * @inp:      user-space read set
 * @outp:     user-space write set
 * @exp:      user-space exception set
 * @end_time: timeout forwarded unchanged to do_select(); may be NULL
 *
 * Returns do_select()'s non-negative result on success.  Errors: -EINVAL
 * when n is negative, -ENOMEM when the bitmap buffer cannot be allocated,
 * whatever get_fd_set() or do_select() report, -ERESTARTNOHAND when nothing
 * is ready and a signal is pending, and -EFAULT when the results cannot be
 * copied back to user space.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
		    fd_set __user *exp, struct timespec *end_time)
{
	/*
	 * Small on-stack scratch area so that small descriptor sets do not
	 * need a heap allocation.
	 */
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
	fd_set_bits fds;
	struct fdtable *fdt;
	unsigned int size;
	void *bits = stack_fds;
	int max_fds;
	int ret;

	if (n < 0)
		return -EINVAL;

	/* Read max_fds from the current task's fdtable inside an RCU read section. */
	rcu_read_lock();
	fdt = files_fdtable(current->files);
	max_fds = fdt->max_fds;
	rcu_read_unlock();

	/* Never look past the size of the descriptor table. */
	if (n > max_fds)
		n = max_fds;

	/*
	 * Six bitmaps are required: in/out/ex as requested by the caller plus
	 * the three matching result maps.  They are carved out of one buffer
	 * in equal slices of `size` bytes.
	 * NOTE(review): FDS_BYTES(n) is presumably the byte count of a bitmap
	 * of n bits rounded up to whole longs -- confirm against its definition.
	 */
	size = FDS_BYTES(n);
	if (size > sizeof(stack_fds) / 6) {
		/* The on-stack array is too small; fall back to the heap. */
		bits = kmalloc(6 * size, GFP_KERNEL);
		if (bits == NULL)
			return -ENOMEM;
	}

	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;

	/* Fetch the three request sets; the first failure aborts. */
	ret = get_fd_set(n, inp, fds.in);
	if (ret)
		goto out;
	ret = get_fd_set(n, outp, fds.out);
	if (ret)
		goto out;
	ret = get_fd_set(n, exp, fds.ex);
	if (ret)
		goto out;

	/* Result maps start out empty; do_select() only ORs bits in. */
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, end_time);
	if (ret < 0)
		goto out;

	/*
	 * Nothing became ready.  If a signal is pending, report that instead
	 * of a plain zero.
	 * NOTE(review): the article states -ERESTARTNOHAND is later converted
	 * to EINTR -- confirm in the signal-delivery code.
	 */
	if (ret == 0 && signal_pending(current)) {
		ret = -ERESTARTNOHAND;
		goto out;
	}

	/* Hand the result maps back to the caller. */
	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	/* Only a kmalloc()ed buffer needs freeing. */
	if (bits != stack_fds)
		kfree(bits);
	return ret;
}
介绍完core_sys_select函数后,接下来看真正实现select逻辑的核心函数do_select。
//位置:fs/select.c
// do_select - poll every requested descriptor and sleep until something is
// ready, the timeout expires, a signal is pending, or an error is recorded.
// (The article locates this function in fs/select.c.)
//
// n:        number of descriptors the caller asked about; replaced below by
//           the non-negative value returned from max_select_fd()
// fds:      in/out/ex are the requested bitmaps, res_in/res_out/res_ex
//           receive the ready bits
// end_time: timeout, or NULL for none; when both of its fields are zero a
//           single scan is made and the task never sleeps
//
// Returns the number of ready bits counted over all three sets, 0 if the
// loop ended with nothing ready, or a negative value taken from
// max_select_fd() or table.error.
int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
{
ktime_t expire, *to = NULL;
struct poll_wqueues table; // wait-queue bookkeeping: set up by poll_initwait(), released by poll_freewait()
poll_table *wait;
int retval, i, timed_out = 0;
unsigned long slack = 0;
rcu_read_lock();
// NOTE(review): max_select_fd() presumably verifies that every descriptor set in the
// bitmaps is open and returns the highest one in use -- confirm against its definition.
retval = max_select_fd(n, fds);
rcu_read_unlock();
if (retval < 0) // a negative result is an error and is returned unchanged
return retval;
n = retval;
poll_initwait(&table); // initialise the wait-queue bookkeeping
// wait is the poll_table handed to each ->poll() call during the first scan.
// NOTE(review): presumably this is how the task gets queued on each file's wait queue -- confirm in poll_wait()/__pollwait().
wait = &table.pt;
// A timeout of exactly zero means "do not block": pass no poll_table and treat the call as already timed out.
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
wait = NULL;
timed_out = 1;
}
if (end_time && !timed_out)
slack = estimate_accuracy(end_time);
retval = 0;
// Loop until something is ready, the timeout expires, a signal is pending or an error is recorded.
for (;;) {
unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
set_current_state(TASK_INTERRUPTIBLE); // mark the task as interruptibly sleeping before the scan starts
inp = fds->in; outp = fds->out; exp = fds->ex;
rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
// Walk the bitmaps one word at a time. i is advanced inside the body; the loop step only moves the result pointers.
for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
unsigned long in, out, ex, all_bits, bit = 1, mask, j;
unsigned long res_in = 0, res_out = 0, res_ex = 0;
const struct file_operations *f_op = NULL;
struct file *file = NULL;
in = *inp++; out = *outp++; ex = *exp++;
all_bits = in | out | ex;
// No descriptor requested in this word: skip all of its bits at once.
// NOTE(review): __NFDBITS is presumably the number of bits in an unsigned long -- confirm.
if (all_bits == 0) {
i += __NFDBITS;
continue;
}
for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
int fput_needed;
if (i >= n)
break;
// Test the bits starting from the least significant one; skip descriptors nobody asked about.
if (!(bit & all_bits))
continue;
// Look up the struct file for descriptor i; paired with fput_light() below.
file = fget_light(i, &fput_needed);
if (file) {
f_op = file->f_op;
mask = DEFAULT_POLLMASK;
// Call the file's own poll method (sock_poll for sockets, per socket_file_ops).
// Once at least one descriptor is ready (retval != 0) NULL is passed instead of the poll_table.
if (f_op && f_op->poll)
mask = (*f_op->poll)(file, retval ? NULL : wait);
// Release the reference taken by fget_light().
fput_light(file, fput_needed);
// Record each requested condition the mask reports. retval counts hits
// across all three sets, so one descriptor can contribute up to three.
if ((mask & POLLIN_SET) && (in & bit)) {
res_in |= bit;
retval++;
}
if ((mask & POLLOUT_SET) && (out & bit)) {
res_out |= bit;
retval++;
}
if ((mask & POLLEX_SET) && (ex & bit)) {
res_ex |= bit;
retval++;
}
}
}
// Store the results for this word; only non-zero words are written (core_sys_select zeroes the result maps beforehand).
if (res_in)
*rinp = res_in;
if (res_out)
*routp = res_out;
if (res_ex)
*rexp = res_ex;
// Offer the scheduler a chance to run something else after each bitmap word.
// NOTE(review): the task is already TASK_INTERRUPTIBLE here (see set_current_state above);
// confirm how cond_resched() behaves in that state and under CONFIG_PREEMPT.
cond_resched();
}
// After the first full scan no poll_table is passed to ->poll() any more.
wait = NULL;
// Stop if something is ready, the timeout has expired, or a signal is pending.
if (retval || timed_out || signal_pending(current))
break;
if (table.error) {
retval = table.error;
break;
}
/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
* pointer to the expiry value.
*/
if (end_time && !to) {
expire = timespec_to_ktime(*end_time);
to = &expire;
}
// Sleep until woken or until the expiry (passed with HRTIMER_MODE_ABS); a zero return is treated as a timeout.
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
timed_out = 1;
}
// Back to the running state.
__set_current_state(TASK_RUNNING);
// Release everything poll_initwait() and the poll calls set up.
poll_freewait(&table);
return retval;
}
前面的这个函数代码很多,实际上最关键的一句就是:mask = (*f_op->poll)(file, retval ? NULL : wait);
//sock_fs_type定义为
// File-system type descriptor registered under the name "sockfs".
// NOTE(review): presumably the pseudo file system backing socket descriptors -- confirm at the registration site.
static struct file_system_type sock_fs_type = {
.name = "sockfs",
.get_sb = sockfs_get_sb, // get_sb callback
.kill_sb = kill_anon_super, // kill_sb callback
};
// struct file_system_type, which the article locates in include/linux/fs.h.
// Describes one file-system type; sock_fs_type fills in name, get_sb and kill_sb.
struct file_system_type {
const char *name; // file-system name, e.g. "sockfs"
int fs_flags;
// NOTE(review): get_sb/kill_sb presumably create and tear down a super_block -- confirm.
int (*get_sb) (struct file_system_type *, int,
const char *, void *, struct vfsmount *);
void (*kill_sb) (struct super_block *);
struct module *owner;
struct file_system_type * next; // pointer to another file_system_type (presumably the next entry in a list -- confirm)
struct list_head fs_supers;
// NOTE(review): the lock_class_key members below are presumably lockdep class keys -- confirm.
struct lock_class_key s_lock_key;
struct lock_class_key s_umount_key;
struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key;
struct lock_class_key i_mutex_dir_key;
struct lock_class_key i_alloc_sem_key;
};
Socket文件系统的相关函数也在net/socket.c里,如下所示:
/*
* Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
* in the operation structures but are done directly via the socketcall() multiplexor.
*/
// file_operations table for socket files.
// The .poll entry (sock_poll) is what do_select() reaches through f_op->poll.
static const struct file_operations socket_file_ops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.aio_read = sock_aio_read,
.aio_write = sock_aio_write,
.poll = sock_poll, // invoked by do_select() as (*f_op->poll)(file, wait)
.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_sock_ioctl,
#endif
.mmap = sock_mmap,
.open = sock_no_open, /* special open code to disallow open via /proc */
.release = sock_close,
.fasync = sock_fasync,
.sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
.splice_read = sock_splice_read,
};
接下来看看sock_poll的实现:
/* No kernel lock held - perfect */
// sock_poll - the .poll entry of socket_file_ops: forwards the poll request
// to the poll method of the socket's protocol operations and returns its mask.
static unsigned int sock_poll(struct file *file, poll_table *wait)
{
struct socket *sock;
/*
* We can't return errors to poll, so it's either yes or no.
*/
sock = file->private_data; // the struct socket pointer is read from file->private_data
// Dispatch through sock->ops->poll; the article names tcp_poll, udp_poll and datagram_poll
// as the protocol implementations reached here (udp_poll reportedly defers to datagram_poll).
return sock->ops->poll(file, sock, wait);
} 以tcp_poll为例,代码位于net/ipv4/Tcp.c中。static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p);3) 遍历所有的fd, 调用其对应的poll方法(对于socket文件系统, 对应方法为sock_poll。其会根据具体协议,分别调用tcp_poll, udp_poll及datagram_poll);
// tcp_poll - TCP poll method (excerpt: the article elides the rest of the body with dots).
// Only the initial poll_wait() call on the socket's sk_sleep wait queue is shown.
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) {
unsigned int mask;
struct sock *sk = sock->sk;
struct tcp_sock *tp = tcp_sk(sk);
// Hand the caller's poll_table and the socket's sk_sleep wait queue to poll_wait().
// NOTE(review): presumably this ends up in __pollwait() and queues the task -- confirm.
poll_wait(file, sk->sk_sleep, wait);
...............
} 5) poll方法(sock_poll)返回时会返回一个描述读写操作是否就绪的mask掩码,根据这个mask掩码给fd_set赋值。原文:http://blog.csdn.net/shltsh/article/details/39349433