首页 > 其他 > 详细

Socket与系统调用深度分析

时间:2019-12-19 21:21:48      阅读:73      评论:0      收藏:0      [点我收藏+]

Socket与系统调用深度分析

1. 系统调用是什么
从用户应用程序的角度来看,内核是一个透明的系统层,它一直存在,但是从未真正的被注意到。进程是不知道内核的工作内容的。比如,进程不知道数据的真实物理地址,哪些数据已经被换入或换出。但是不可否认的是,进程在执行的过程中,或多或少的在与内核交互,请求内存资源、访问外设、与其它进程通信等等。为了达到这些目的,进程使用标准C库,C库里的函数调用内核函数,最终由内核负责在各个请求进程之间不失公平的进行处理。
因此,应用程序看到的内核是负责执行各种系统功能的函数集合。标准C库只是一个中间层,用于在不同的体系结构和系统之间,标准化并简化内核调用方式。
最后,大致应该明白了:系统调用是操作系统提供给应用程序访问系统资源的接口,应用程序是通过这个接口来获得操作系统的服务的,比如打开文件,读文件等。
2. 内核态与用户态
在上面我们知道,应用程序要访问系统资源,必须通过系统调用。但是这里有一个问题,为什么要这么做?如果考虑到程序的执行效率,为什么不能由应用程序直接访问系统资源呢,非要间接的通过系统调用来实现?这里就涉及到linux系统的用户态和内核态概念了。
之所以要有内核态和用户态的区分,最直接的理由就是系统安全。你想想,如果用户程序能够直接访问硬件,万一操作不当,就可能面临系统崩溃的局面。刚买的一台电脑,你刚写个程序,一运行,电脑就崩,这样的代价太大了。所以,必须要区分内核态和用户态。其他的理由请自行百度。
这里说明一点:应用程序运行在用户态,当发生系统调用时,系统会自动陷入内核态。当系统调用处理完成后,再回到用户态。在32位x86体系中,应用程序传统上是通过int 0x80软中断实现从用户态到内核态的转换的(x86-64体系则使用专门的syscall指令)。
3. socket数据结构
<linux/net.h>

/*
 * Protocol-independent socket object.
 *
 * Nothing in this structure is tied to a particular protocol; the
 * protocol-specific handlers are reached through the ops table.
 */
struct socket {
    /* NOTE(review): presumably the connection state of the socket — confirm against socket_state */
    socket_state        state;
    kmemcheck_bitfield_begin(type);
    /* NOTE(review): presumably the socket type (e.g. SOCK_STREAM) — confirm */
    short           type;
    kmemcheck_bitfield_end(type);
    unsigned long       flags;
    struct socket_wq __rcu  *wq;
    /* NOTE(review): looks like the struct file this socket is exposed through — confirm */
    struct file     *file;
    struct sock     *sk;
    /* table of protocol-specific operations (see struct proto_ops) */
    const struct proto_ops  *ops;
};

socket的定义并未绑定具体的协议内容,这也说明了为什么需要proto_ops指针指向一个数据结构。其中包含了用于处理套接字的特定协议的函数:
<linux/net.h>

    /*
     * Table of protocol-specific socket operations.
     *
     * Apart from family and owner, every member is a function pointer that
     * implements one socket operation for a given protocol. A struct socket
     * refers to one of these tables through its ops member.
     *
     * The compat_* members only exist when CONFIG_COMPAT is defined.
     */
    struct proto_ops {
    int     family;
    struct module   *owner;
    int     (*release)   (struct socket *sock);
    int     (*bind)      (struct socket *sock,
                      struct sockaddr *myaddr,
                      int sockaddr_len);
    int     (*connect)   (struct socket *sock,
                      struct sockaddr *vaddr,
                      int sockaddr_len, int flags);
    int     (*socketpair)(struct socket *sock1,
                      struct socket *sock2);
    int     (*accept)    (struct socket *sock,
                      struct socket *newsock, int flags);
    int     (*getname)   (struct socket *sock,
                      struct sockaddr *addr,
                      int *sockaddr_len, int peer);
    unsigned int    (*poll)      (struct file *file, struct socket *sock,
                      struct poll_table_struct *wait);
    int     (*ioctl)     (struct socket *sock, unsigned int cmd,
                      unsigned long arg);
#ifdef CONFIG_COMPAT
    int     (*compat_ioctl) (struct socket *sock, unsigned int cmd,
                      unsigned long arg);
#endif
    int     (*listen)    (struct socket *sock, int len);
    int     (*shutdown)  (struct socket *sock, int flags);
    /* optval/optlen marked __user point into user space */
    int     (*setsockopt)(struct socket *sock, int level,
                      int optname, char __user *optval, unsigned int optlen);
    int     (*getsockopt)(struct socket *sock, int level,
                      int optname, char __user *optval, int __user *optlen);
#ifdef CONFIG_COMPAT
    int     (*compat_setsockopt)(struct socket *sock, int level,
                      int optname, char __user *optval, unsigned int optlen);
    int     (*compat_getsockopt)(struct socket *sock, int level,
                      int optname, char __user *optval, int __user *optlen);
#endif
    int     (*sendmsg)   (struct kiocb *iocb, struct socket *sock,
                      struct msghdr *m, size_t total_len);
    int     (*recvmsg)   (struct kiocb *iocb, struct socket *sock,
                      struct msghdr *m, size_t total_len,
                      int flags);
    int     (*mmap)      (struct file *file, struct socket *sock,
                      struct vm_area_struct * vma);
    ssize_t     (*sendpage)  (struct socket *sock, struct page *page,
                      int offset, size_t size, int flags);
    ssize_t     (*splice_read)(struct socket *sock,  loff_t *ppos,
                       struct pipe_inode_info *pipe, size_t len, unsigned int flags);
};

许多函数指针都与C标准库函数同名。这不是巧合。因为C库函数会通过socketcall系统调用导向上述的函数指针。
4. socket与文件
在建立连接之后,用户空间进程使用普通的文件操作来访问套接字。这在内核中是如何实现的呢?这就多亏了VFS层的开放结构,只需要做很少的工作。因为对套接字文件描述符的文件操作,可以透明地重定向到网络子系统的代码中。套接字使用的file_operations结构如下:
<net/socket.c>

/*
 * File operations used for sockets: each generic file operation on a
 * socket's file is routed to the handler named here.
 *
 * .compat_ioctl is only set when CONFIG_COMPAT is defined.
 */
static const struct file_operations socket_file_ops = {
    .owner =    THIS_MODULE,
    .llseek =   no_llseek,
    .aio_read = sock_aio_read,
    .aio_write =    sock_aio_write,
    .poll =     sock_poll,
    .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = compat_sock_ioctl,
#endif
    .mmap =     sock_mmap,
    .open =     sock_no_open,   /* special open code to disallow open via /proc */
    .release =  sock_close,
    .fasync =   sock_fasync,
    .sendpage = sock_sendpage,
    .splice_write = generic_splice_sendpage,
    .splice_read =  sock_splice_read,
};

5. socketcall系统调用
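linux提供了socketcall系统调用,它实现在sys_socketcall中。需要注意的是,只有定义了__ARCH_WANT_SYS_SOCKETCALL的体系结构(例如32位x86)才会编译这个入口;在x86-64等体系结构上,socket、bind、connect等都是各自独立的系统调用,这与后文strace输出中直接出现socket()、bind()等调用是一致的。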

#ifdef __ARCH_WANT_SYS_SOCKETCALL
/*
 * Argument list sizes for sys_socketcall.
 *
 * nargs[call] is the number of BYTES of user-supplied arguments that
 * socketcall copies in for call number `call`. AL(x) converts an argument
 * count x into bytes (x * sizeof(unsigned long)), and the helper macro is
 * undefined again right after the table.
 *
 * Entry 0 is AL(0); socketcall rejects call numbers below 1 before it
 * indexes this table.
 */
#define AL(x) ((x) * sizeof(unsigned long))
static const unsigned char nargs[21] = {
    AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
    AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
    AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
    AL(4), AL(5), AL(4)
};
#undef AL

sys_socketcall的任务其实并不困难,它充当“socket多路分配器”,将系统调用转到其他具体的函数执行,并传递参数,这些具体函数中的每一个都实现了一个“更小”的系统调用。

/*
 *  System call vectors.
 *
 *  Argument checking cleaned up. Saved 20% in size.
 *  This function doesn't need to set the kernel lock because
 *  it is set by the callees.
 *
 *  socketcall(call, args): multiplexer for the socket system calls.
 *  @call selects the operation (SYS_SOCKET ... SYS_SENDMMSG) and @args
 *  points to a user-space array of unsigned long holding its arguments.
 *
 *  Returns the result of the selected sys_* handler, -EINVAL when @call
 *  is out of range or its argument size does not fit the local buffer,
 *  and -EFAULT when the arguments cannot be copied from user space.
 */

SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
    unsigned long a[6];
    unsigned long a0, a1;
    int err;
    unsigned int len;

    /* valid call numbers are 1 .. SYS_SENDMMSG inclusive */
    if (call < 1 || call > SYS_SENDMMSG)
        return -EINVAL;

    /* len is a byte count; never copy more than a[] can hold */
    len = nargs[call];
    if (len > sizeof(a))
        return -EINVAL;

    /* copy_from_user should be SMP safe. */
    if (copy_from_user(a, args, len))
        return -EFAULT;

    /* audit_socketcall() is given the argument COUNT, not the byte size */
    audit_socketcall(nargs[call] / sizeof(unsigned long), a);

    a0 = a[0];
    a1 = a[1];

    /*
     * Dispatch: each case forwards the copied arguments, cast to the
     * types the handler expects, and stores the handler's result in err.
     */
    switch (call) {
    case SYS_SOCKET:
        err = sys_socket(a0, a1, a[2]);
        break;
    case SYS_BIND:
        err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_CONNECT:
        err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_LISTEN:
        err = sys_listen(a0, a1);
        break;
    case SYS_ACCEPT:
        /* plain accept is served by sys_accept4 with flags == 0 */
        err = sys_accept4(a0, (struct sockaddr __user *)a1,
                  (int __user *)a[2], 0);
        break;
    case SYS_GETSOCKNAME:
        err =
            sys_getsockname(a0, (struct sockaddr __user *)a1,
                    (int __user *)a[2]);
        break;
    case SYS_GETPEERNAME:
        err =
            sys_getpeername(a0, (struct sockaddr __user *)a1,
                    (int __user *)a[2]);
        break;
    case SYS_SOCKETPAIR:
        err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
        break;
    case SYS_SEND:
        err = sys_send(a0, (void __user *)a1, a[2], a[3]);
        break;
    case SYS_SENDTO:
        err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
                 (struct sockaddr __user *)a[4], a[5]);
        break;
    case SYS_RECV:
        err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
        break;
    case SYS_RECVFROM:
        err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                   (struct sockaddr __user *)a[4],
                   (int __user *)a[5]);
        break;
    case SYS_SHUTDOWN:
        err = sys_shutdown(a0, a1);
        break;
    case SYS_SETSOCKOPT:
        err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
        break;
    case SYS_GETSOCKOPT:
        err =
            sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
                   (int __user *)a[4]);
        break;
    case SYS_SENDMSG:
        err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
        break;
    case SYS_SENDMMSG:
        err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]);
        break;
    case SYS_RECVMSG:
        err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
        break;
    case SYS_RECVMMSG:
        err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
                   (struct timespec __user *)a[4]);
        break;
    case SYS_ACCEPT4:
        err = sys_accept4(a0, (struct sockaddr __user *)a1,
                  (int __user *)a[2], a[3]);
        break;
    default:
        /* a call number that passed the range check but has no case above */
        err = -EINVAL;
        break;
    }
    return err;
}

可以看到,上面switch中的20个套接字操作(从SYS_SOCKET到SYS_SENDMMSG)都只对应这一个系统调用总入口,这是比较引人注目的。
6. 服务器端代码

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>

#define N 64

/*
 * TCP echo server.
 *
 * Usage: server <ip> <port>
 *
 * Binds a listening socket to the given IPv4 address and port and then
 * serves one client at a time: every chunk received from the client is
 * printed to stdout and echoed back, until the peer closes the connection
 * or an I/O error occurs on it.
 *
 * Returns 0 after printing the usage message when too few arguments are
 * given. Exits with status -1 if socket(), bind(), listen() or accept()
 * fails. The accept loop never terminates normally.
 */
int main( int argc, char *argv[] )
{
    int listenfd, connfd;
    struct sockaddr_in myaddr, peeraddr;
    socklen_t len;
    char buf[N] = {0};
    ssize_t n, sent, w;

    if(argc < 3)
    {
        printf("Usage: %s ip port\n", argv[0]);
        return 0;
    }

    if((listenfd = socket(AF_INET, SOCK_STREAM, 0)) == -1)
    {
        perror("socket");
        exit(-1);
    }

    memset(&myaddr, 0, sizeof(myaddr));
    myaddr.sin_family = AF_INET;
    myaddr.sin_port = htons(atoi(argv[2]));
    myaddr.sin_addr.s_addr = inet_addr(argv[1]);

    if(bind(listenfd, (struct sockaddr *) &myaddr, sizeof(myaddr)) == -1)
    {
        perror("bind");
        exit(-1);
    }

    if(-1 == listen(listenfd, 5))
    {
        perror("listen");
        exit(-1);
    }

    while(1)
    {
        /*
         * The length argument of accept() is value-result: the call
         * overwrites it, so it has to be reset before every accept().
         */
        memset(&peeraddr, 0, sizeof(peeraddr));
        len = sizeof(peeraddr);

        if((connfd = accept(listenfd, (struct sockaddr *)&peeraddr, &len)) == -1)
        {
            perror("accept");
            exit(-1);
        }

        printf("from %s:%d\n", inet_ntoa(peeraddr.sin_addr), ntohs(peeraddr.sin_port));

        while(1)
        {
            /* Read at most N-1 bytes so the terminator always fits in buf. */
            n = recv(connfd, buf, sizeof(buf) - 1, 0);
            if(n == -1)
            {
                /* Do not index buf with -1 or pass it on as a length. */
                perror("recv");
                break;
            }
            if(n == 0)
            {
                /* orderly shutdown by the peer */
                break;
            }

            buf[n] = '\0';
            printf("n=%zd %s", n, buf);

            /* send() may accept fewer bytes than requested: loop until done. */
            sent = 0;
            while(sent < n)
            {
                w = send(connfd, buf + sent, (size_t)(n - sent), 0);
                if(w == -1)
                {
                    perror("send");
                    break;
                }
                sent += w;
            }
            if(sent < n)
            {
                /* echo failed: drop this client, keep serving others */
                break;
            }
        }
        close(connfd);
    }

    return 0;
}

7. 客户端代码

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>

#define N 64

/*
 * TCP echo client.
 *
 * Usage: client <ip> <port>
 *
 * Connects to the server at the given IPv4 address and port, then for
 * every line read from stdin sends it to the server and prints what the
 * server sends back. The loop ends on end-of-file on stdin, when the
 * server closes the connection, or on a send/recv error.
 *
 * Returns 0 after printing the usage message when too few arguments are
 * given and at the end of the session. Exits with status -1 if socket()
 * or connect() fails.
 */
int main( int argc, char *argv[] )
{
    int sockfd;
    struct sockaddr_in servaddr;
    char buf[N] = {0};
    ssize_t n;

    if (argc < 3)
    {
        printf("usage : %s ip port\n", argv[0]);
        return 0;
    }

    if((sockfd = socket(AF_INET, SOCK_STREAM, 0)) == -1)
    {
        perror("socket");
        exit(-1);
    }


#if 0
    /* Optional: bind the client to a fixed local port before connecting. */
    struct sockaddr_in myaddr;

    memset(&myaddr, 0, sizeof(myaddr));
    myaddr.sin_family = AF_INET;
    myaddr.sin_port = htons(8000);
    myaddr.sin_addr.s_addr = inet_addr(argv[1]);

    if(bind(sockfd, (struct sockaddr *) &myaddr, sizeof(myaddr)) == -1)
    {
        perror("bind");
        exit(-1);
    }

#endif

    memset(&servaddr, 0, sizeof(servaddr));
    servaddr.sin_family = AF_INET;
    servaddr.sin_port = htons(atoi(argv[2]));
    servaddr.sin_addr.s_addr = inet_addr(argv[1]);

    if(connect(sockfd, (struct sockaddr *)&servaddr, sizeof(servaddr)) == -1)
    {
        /* report the call that actually failed */
        perror("connect");
        exit(-1);
    }


    printf(">");
    fflush(stdout);     /* the prompt has no newline, so flush it explicitly */
    while(fgets(buf, N, stdin) != NULL)
    {
        if(send(sockfd, buf, strlen(buf), 0) == -1)
        {
            perror("send");
            break;
        }

        /* Read at most N-1 bytes so the reply can always be terminated. */
        n = recv(sockfd, buf, sizeof(buf) - 1, 0);
        if(n == -1)
        {
            perror("recv");
            break;
        }
        if(n == 0)
        {
            /* orderly shutdown by the server: nothing more to read */
            fprintf(stderr, "connection closed by server\n");
            break;
        }
        buf[n] = '\0';
        printf("%s\n", buf);

        printf(">");
        fflush(stdout);
    }

    close(sockfd);

    return 0;
}

执行结果
服务器端:

linuxclass@ubuntu:~/qemulinux/net_ws/tcp$ strace ./server 127.0.0.1 12334
execve("./server", ["./server", "127.0.0.1", "12334"], [/* 60 vars */]) = 0
brk(0)                                  = 0xbf4000
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=95728, ...}) = 0
mmap(NULL, 95728, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7ff559b32000
close(3)                                = 0
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
open("/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0P \2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1857312, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ff559b31000
mmap(NULL, 3965632, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7ff55955f000
mprotect(0x7ff55971d000, 2097152, PROT_NONE) = 0
mmap(0x7ff55991d000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1be000) = 0x7ff55991d000
mmap(0x7ff559923000, 17088, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7ff559923000
close(3)                                = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ff559b2f000
arch_prctl(ARCH_SET_FS, 0x7ff559b2f740) = 0
mprotect(0x7ff55991d000, 16384, PROT_READ) = 0
mprotect(0x601000, 4096, PROT_READ)     = 0
mprotect(0x7ff559b4a000, 4096, PROT_READ) = 0
munmap(0x7ff559b32000, 95728)           = 0
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 3
bind(3, {sa_family=AF_INET, sin_port=htons(12334), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
listen(3, 5)                            = 0
accept(3, {sa_family=AF_INET, sin_port=htons(56512), sin_addr=inet_addr("127.0.0.1")}, [16]) = 4
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 9), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ff559b49000
write(1, "from 127.0.0.1:56512\n", 21from 127.0.0.1:56512
)  = 21

可以看出,在跟踪系统调用的过程中,依次调用了socket-->bind-->listen-->accept-->write。
客户端:

linuxclass@ubuntu:~/qemulinux/net_ws/tcp$ strace ./client 127.0.0.1 12334
execve("./client", ["./client", "127.0.0.1", "12334"], [/* 60 vars */]) = 0
brk(0)                                  = 0xa8c000
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=95728, ...}) = 0
mmap(NULL, 95728, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f379fbbd000
close(3)                                = 0
access("/etc/ld.so.nohwcap", F_OK)      = -1 ENOENT (No such file or directory)
open("/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0P \2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1857312, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f379fbbc000
mmap(NULL, 3965632, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f379f5ea000
mprotect(0x7f379f7a8000, 2097152, PROT_NONE) = 0
mmap(0x7f379f9a8000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1be000) = 0x7f379f9a8000
mmap(0x7f379f9ae000, 17088, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f379f9ae000
close(3)                                = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f379fbba000
arch_prctl(ARCH_SET_FS, 0x7f379fbba740) = 0
mprotect(0x7f379f9a8000, 16384, PROT_READ) = 0
mprotect(0x600000, 4096, PROT_READ)     = 0
mprotect(0x7f379fbd5000, 4096, PROT_READ) = 0
munmap(0x7f379fbbd000, 95728)           = 0
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 3
connect(3, {sa_family=AF_INET, sin_port=htons(12334), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 1), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f379fbd4000
fstat(0, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 1), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f379fbd3000
write(1, ">", 1>)                        = 1
read(0, 

可以看出,在跟踪系统调用的过程中,依次调用了socket-->connect-->write
8. 总结
自己对socket的一点理解,谢谢!

Socket与系统调用深度分析

原文:https://www.cnblogs.com/ustc-hwq/p/12069661.html

(0)
(0)
   
举报
评论 一句话评论(0
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!