1. 系统调用是什么
从用户应用程序的角度来看,内核是一个透明的系统层,它一直存在,但是从未真正地被注意到。进程是不知道内核的工作内容的。比如,进程不知道数据的真实物理地址,也不知道哪些数据已经被换入或换出。但是不可否认的是,进程在执行的过程中,或多或少地在与内核交互,请求内存资源、访问外设、与其它进程通信等等。为了达到这些目的,进程使用标准C库,C库里的函数调用内核函数,最终由内核负责在各个请求进程之间不失公平地进行处理。
因此,应用程序看到的内核是负责执行各种系统功能的函数集合。标准C库只是一个中间层,用于在不同的体系结构和系统之间,标准化并简化内核调用方式。
最后,大致应该明白了:系统调用是操作系统提供给应用程序访问系统资源的接口,应用程序是通过这个接口来获得操作系统的服务的,比如打开文件,读文件等。
2. 内核态与用户态
在上面我们知道,应用程序要访问系统资源,必须通过系统调用。但是这里有一个问题,为什么要这么做?如果考虑到程序的执行效率,为什么不能由应用程序直接访问系统资源呢,非要间接的通过系统调用来实现?这里就涉及到linux系统的用户态和内核态概念了。
之所以要有内核态和用户态的区分,最直接的理由就是系统安全。你想想,如果用户程序能够直接访问硬件,万一操作不当,就可能面临系统崩溃的局面。刚买的一台电脑,你刚写个程序,一运行,电脑就崩,这样的代价太大了。所以,必须要区分内核态和用户态。其他的理由请自行百度。
这里说明一点:应用程序运行在用户态,当发生系统调用时,系统会自动陷入内核态。当系统调用处理完成后,再回到用户态。在32位x86体系中,应用程序传统上是通过int 0x80软中断实现从用户态到内核态的转换的;在x86-64体系中则使用syscall指令。
3. socket数据结构
<net.h>
/*
 * struct socket - the kernel's socket object, as excerpted from <net.h>.
 *
 * Nothing in this structure is tied to a particular protocol; the
 * protocol-specific behaviour is reached through @ops.
 *
 * NOTE(review): the per-field notes marked "presumably" are inferred from
 * field names and types only - confirm them against the kernel-doc comment
 * that accompanies this structure in the kernel tree.
 */
struct socket {
socket_state state; /* presumably the connection state - confirm */
kmemcheck_bitfield_begin(type);
short type; /* presumably the socket type, e.g. SOCK_STREAM - confirm */
kmemcheck_bitfield_end(type);
unsigned long flags; /* presumably socket flag bits - confirm */
struct socket_wq __rcu *wq; /* RCU-annotated pointer; presumably the wait queue - confirm */
struct file *file; /* presumably the file object behind the socket's descriptor - confirm */
struct sock *sk; /* presumably the network-layer representation of the socket - confirm */
const struct proto_ops *ops; /* table of protocol-specific socket operations */
};
socket的定义并未绑定具体的协议内容,这也说明了为什么需要proto_ops指针指向一个数据结构。其中包含了用于处理套接字的特定协议的函数:
<net.h>
/*
 * struct proto_ops - table of protocol-specific handlers for a socket.
 *
 * struct socket points at one of these tables through its ops member, which
 * is how the protocol-independent socket is connected to the code of a
 * concrete protocol. Many members deliberately carry the same name as the
 * user-space socket function whose request ends up in them (bind, connect,
 * accept, listen, ...).
 *
 * The compat_* members exist only when CONFIG_COMPAT is defined.
 */
struct proto_ops {
int family; /* presumably the address family served by this table - confirm */
struct module *owner;
int (*release) (struct socket *sock);
int (*bind) (struct socket *sock,
struct sockaddr *myaddr,
int sockaddr_len);
int (*connect) (struct socket *sock,
struct sockaddr *vaddr,
int sockaddr_len, int flags);
int (*socketpair)(struct socket *sock1,
struct socket *sock2);
int (*accept) (struct socket *sock,
struct socket *newsock, int flags);
int (*getname) (struct socket *sock,
struct sockaddr *addr,
int *sockaddr_len, int peer);
unsigned int (*poll) (struct file *file, struct socket *sock,
struct poll_table_struct *wait);
int (*ioctl) (struct socket *sock, unsigned int cmd,
unsigned long arg);
#ifdef CONFIG_COMPAT
int (*compat_ioctl) (struct socket *sock, unsigned int cmd,
unsigned long arg);
#endif
int (*listen) (struct socket *sock, int len);
int (*shutdown) (struct socket *sock, int flags);
/* Option values and lengths are passed as user-space (__user) pointers. */
int (*setsockopt)(struct socket *sock, int level,
int optname, char __user *optval, unsigned int optlen);
int (*getsockopt)(struct socket *sock, int level,
int optname, char __user *optval, int __user *optlen);
#ifdef CONFIG_COMPAT
int (*compat_setsockopt)(struct socket *sock, int level,
int optname, char __user *optval, unsigned int optlen);
int (*compat_getsockopt)(struct socket *sock, int level,
int optname, char __user *optval, int __user *optlen);
#endif
int (*sendmsg) (struct kiocb *iocb, struct socket *sock,
struct msghdr *m, size_t total_len);
int (*recvmsg) (struct kiocb *iocb, struct socket *sock,
struct msghdr *m, size_t total_len,
int flags);
int (*mmap) (struct file *file, struct socket *sock,
struct vm_area_struct * vma);
ssize_t (*sendpage) (struct socket *sock, struct page *page,
int offset, size_t size, int flags);
ssize_t (*splice_read)(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len, unsigned int flags);
};
许多函数指针都与C标准库函数同名。这不是巧合。因为C库函数会通过socketcall系统调用(在x86-64等体系结构上则是通过各自独立的系统调用)导向上述的函数指针。
4. socket与文件
在建立连接之后,用户空间进程使用普通的文件操作来访问套接字。这在内核中是如何实现的呢?这就多亏了VFS层的开放结构,只需要做很少的工作。因为对套接字文件描述符的文件操作,可以透明的重定向到网络子系统的代码中。套接字使用的file_operations 结构如下
<net/socket.c>
/*
 * File operations installed for socket file descriptors. Ordinary file
 * operations issued on a socket descriptor are routed through this table
 * into the sock_* handlers of the network code.
 */
static const struct file_operations socket_file_ops = {
	.owner		= THIS_MODULE,

	/* opening and closing */
	.open		= sock_no_open,	/* special open code to disallow open via /proc */
	.release	= sock_close,

	/* data transfer */
	.aio_read	= sock_aio_read,
	.aio_write	= sock_aio_write,
	.sendpage	= sock_sendpage,
	.splice_read	= sock_splice_read,
	.splice_write	= generic_splice_sendpage,

	/* positioning and mapping */
	.llseek		= no_llseek,
	.mmap		= sock_mmap,

	/* readiness, notification and control */
	.poll		= sock_poll,
	.fasync		= sock_fasync,
	.unlocked_ioctl	= sock_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= compat_sock_ioctl,
#endif
};
5. socketcall系统调用
linux提供了socketcall系统调用,它实现在sys_socketcall中。
#ifdef __ARCH_WANT_SYS_SOCKETCALL
/*
 * Argument list sizes for sys_socketcall.
 *
 * Entry i holds the size in bytes of the argument list taken by call
 * number i, i.e. its argument count multiplied by sizeof(unsigned long).
 * Call number 0 is rejected by sys_socketcall, so entry 0 is a placeholder.
 */
#define ARG_BYTES(count) ((count) * sizeof(unsigned long))
static const unsigned char nargs[21] = {
	ARG_BYTES(0),	/*  0 */
	ARG_BYTES(3),	/*  1 */
	ARG_BYTES(3),	/*  2 */
	ARG_BYTES(3),	/*  3 */
	ARG_BYTES(2),	/*  4 */
	ARG_BYTES(3),	/*  5 */
	ARG_BYTES(3),	/*  6 */
	ARG_BYTES(3),	/*  7 */
	ARG_BYTES(4),	/*  8 */
	ARG_BYTES(4),	/*  9 */
	ARG_BYTES(4),	/* 10 */
	ARG_BYTES(6),	/* 11 */
	ARG_BYTES(6),	/* 12 */
	ARG_BYTES(2),	/* 13 */
	ARG_BYTES(5),	/* 14 */
	ARG_BYTES(5),	/* 15 */
	ARG_BYTES(3),	/* 16 */
	ARG_BYTES(3),	/* 17 */
	ARG_BYTES(4),	/* 18 */
	ARG_BYTES(5),	/* 19 */
	ARG_BYTES(4),	/* 20 */
};
#undef ARG_BYTES
sys_socketcall的任务其实并不困难,它充当“socket多路分配器”,将系统调用转到其他具体的函数执行,并传递参数,后者中的每个函数都实现了一个“更小”的系统调用。
/*
 * System call vectors.
 *
 * Argument checking cleaned up. Saved 20% in size.
 * This function doesn't need to set the kernel lock because
 * it is set by the callees.
 *
 * socketcall multiplexes the socket operations behind a single system
 * call: @call selects the operation and @args points to a user-space array
 * of unsigned long values holding that operation's arguments.
 *
 * Returns -EINVAL when @call is below 1 or above SYS_SENDMMSG, or when the
 * argument size taken from nargs[] does not fit the local buffer; -EFAULT
 * when the arguments cannot be copied from user space; otherwise whatever
 * the selected sys_* function returns.
 */
SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
unsigned long a[6];
unsigned long a0, a1;
int err;
unsigned int len;
/*
 * Reject call numbers outside the supported range.
 * NOTE(review): this relies on SYS_SENDMMSG being the highest call number
 * and on nargs[] having an entry for it - confirm against linux/net.h.
 */
if (call < 1 || call > SYS_SENDMMSG)
return -EINVAL;
/* Size in bytes of the argument list this call takes. */
len = nargs[call];
/* Never copy more than the local argument buffer can hold. */
if (len > sizeof(a))
return -EINVAL;
/* copy_from_user should be SMP safe. */
if (copy_from_user(a, args, len))
return -EFAULT;
/* Pass the argument count (bytes / word size) and the copied values to audit. */
audit_socketcall(nargs[call] / sizeof(unsigned long), a);
/* Every case below uses the first two arguments, so name them once. */
a0 = a[0];
a1 = a[1];
/* Unpack the arguments, cast them to the callee's types and dispatch. */
switch (call) {
case SYS_SOCKET:
err = sys_socket(a0, a1, a[2]);
break;
case SYS_BIND:
err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
break;
case SYS_CONNECT:
err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
break;
case SYS_LISTEN:
err = sys_listen(a0, a1);
break;
case SYS_ACCEPT:
/* Plain accept is served by sys_accept4 with a flags value of 0. */
err = sys_accept4(a0, (struct sockaddr __user *)a1,
(int __user *)a[2], 0);
break;
case SYS_GETSOCKNAME:
err =
sys_getsockname(a0, (struct sockaddr __user *)a1,
(int __user *)a[2]);
break;
case SYS_GETPEERNAME:
err =
sys_getpeername(a0, (struct sockaddr __user *)a1,
(int __user *)a[2]);
break;
case SYS_SOCKETPAIR:
err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
break;
case SYS_SEND:
err = sys_send(a0, (void __user *)a1, a[2], a[3]);
break;
case SYS_SENDTO:
err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
(struct sockaddr __user *)a[4], a[5]);
break;
case SYS_RECV:
err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
break;
case SYS_RECVFROM:
err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
(struct sockaddr __user *)a[4],
(int __user *)a[5]);
break;
case SYS_SHUTDOWN:
err = sys_shutdown(a0, a1);
break;
case SYS_SETSOCKOPT:
err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
break;
case SYS_GETSOCKOPT:
err =
sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
(int __user *)a[4]);
break;
case SYS_SENDMSG:
err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
break;
case SYS_SENDMMSG:
err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]);
break;
case SYS_RECVMSG:
err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
break;
case SYS_RECVMMSG:
err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
(struct timespec __user *)a[4]);
break;
case SYS_ACCEPT4:
/* Same callee as SYS_ACCEPT, but the flags come from the caller. */
err = sys_accept4(a0, (struct sockaddr __user *)a1,
(int __user *)a[2], a[3]);
break;
default:
err = -EINVAL;
break;
}
return err;
}
可以看到,上面switch语句中的20个套接字操作只是对应这一个系统调用总入口,这是比较引人注目的。
6. 服务器端代码
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#define N 64

/*
 * Parse a decimal TCP port number.
 *
 * Returns 0 and stores the value in *port when text is a complete decimal
 * number in the range [0, 65535]; returns -1 otherwise and leaves *port
 * untouched.
 */
static int parse_port(const char *text, unsigned short *port)
{
    char *end;
    long value;

    errno = 0;
    value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE ||
        value < 0 || value > 65535)
        return -1;

    *port = (unsigned short)value;
    return 0;
}

/*
 * TCP echo server.
 *
 * Usage: server ip port
 *
 * Binds to ip:port, then serves one client at a time: every chunk received
 * is printed to stdout and sent back unchanged until the peer closes the
 * connection or an I/O error occurs, after which the next client is
 * accepted.
 *
 * Returns 0 after printing the usage text when arguments are missing; exits
 * with status -1 on invalid arguments or when socket setup / accept fails.
 */
int main( int argc, char *argv[] )
{
    int listenfd, connfd;
    struct sockaddr_in myaddr, peeraddr;
    socklen_t len;
    char buf[N] = {0};
    ssize_t n;
    unsigned short port;

    if (argc < 3) {
        printf("Usage: %s ip port\n", argv[0]);
        return 0;
    }

    if (parse_port(argv[2], &port) == -1) {
        fprintf(stderr, "invalid port: %s\n", argv[2]);
        exit(-1);
    }

    memset(&myaddr, 0, sizeof(myaddr));
    myaddr.sin_family = AF_INET;
    myaddr.sin_port = htons(port);
    myaddr.sin_addr.s_addr = inet_addr(argv[1]);
    /* inet_addr() reports a malformed address as (in_addr_t)-1. */
    if (myaddr.sin_addr.s_addr == (in_addr_t)-1) {
        fprintf(stderr, "invalid ip: %s\n", argv[1]);
        exit(-1);
    }

    if ((listenfd = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
        perror("socket");
        exit(-1);
    }

    if (bind(listenfd, (struct sockaddr *)&myaddr, sizeof(myaddr)) == -1) {
        perror("bind");
        exit(-1);
    }

    if (listen(listenfd, 5) == -1) {
        perror("listen");
        exit(-1);
    }

    while (1) {
        /*
         * The length is a value-result argument: accept() overwrites it,
         * so it has to be reset before every call.
         */
        memset(&peeraddr, 0, sizeof(peeraddr));
        len = sizeof(peeraddr);
        if ((connfd = accept(listenfd, (struct sockaddr *)&peeraddr, &len)) == -1) {
            perror("accept");
            exit(-1);
        }
        printf("from %s:%d\n", inet_ntoa(peeraddr.sin_addr), ntohs(peeraddr.sin_port));

        while (1) {
            /* Leave one byte free so the terminator always fits in buf. */
            n = recv(connfd, buf, sizeof(buf) - 1, 0);
            if (n < 0) {
                perror("recv");
                break;
            }
            if (n == 0) {
                /* Peer closed the connection. */
                break;
            }
            buf[n] = '\0';
            printf("n=%zd %s", n, buf);
            if (send(connfd, buf, (size_t)n, 0) == -1) {
                perror("send");
                break;
            }
        }
        close(connfd);
    }
    return 0;
}
7. 客户端代码
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#define N 64

/*
 * Parse a decimal TCP port number.
 *
 * Returns 0 and stores the value in *port when text is a complete decimal
 * number in the range [0, 65535]; returns -1 otherwise and leaves *port
 * untouched.
 */
static int parse_port(const char *text, unsigned short *port)
{
    char *end;
    long value;

    errno = 0;
    value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE ||
        value < 0 || value > 65535)
        return -1;

    *port = (unsigned short)value;
    return 0;
}

/*
 * TCP echo client.
 *
 * Usage: client ip port
 *
 * Connects to ip:port, then for every line read from stdin sends the line
 * to the server and prints the reply. Stops at end of input, when the
 * server closes the connection, or on an I/O error.
 *
 * Returns 0 after printing the usage text when arguments are missing; exits
 * with status -1 on invalid arguments or when socket setup / connect fails.
 */
int main( int argc, char *argv[] )
{
    int sockfd;
    struct sockaddr_in servaddr;
    char buf[N] = {0};
    ssize_t n;
    unsigned short port;

    if (argc < 3) {
        printf("usage : %s ip port\n", argv[0]);
        return 0;
    }

    if (parse_port(argv[2], &port) == -1) {
        fprintf(stderr, "invalid port: %s\n", argv[2]);
        exit(-1);
    }

    memset(&servaddr, 0, sizeof(servaddr));
    servaddr.sin_family = AF_INET;
    servaddr.sin_port = htons(port);
    servaddr.sin_addr.s_addr = inet_addr(argv[1]);
    /* inet_addr() reports a malformed address as (in_addr_t)-1. */
    if (servaddr.sin_addr.s_addr == (in_addr_t)-1) {
        fprintf(stderr, "invalid ip: %s\n", argv[1]);
        exit(-1);
    }

    if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
        perror("socket");
        exit(-1);
    }

#if 0
    /*
     * Disabled: bind the client to local port 8000 before connecting
     * instead of letting the system choose the local port.
     */
    {
        struct sockaddr_in myaddr;

        memset(&myaddr, 0, sizeof(myaddr));
        myaddr.sin_family = AF_INET;
        myaddr.sin_port = htons(8000);
        myaddr.sin_addr.s_addr = inet_addr(argv[1]);
        if (bind(sockfd, (struct sockaddr *)&myaddr, sizeof(myaddr)) == -1) {
            perror("bind");
            exit(-1);
        }
    }
#endif

    if (connect(sockfd, (struct sockaddr *)&servaddr, sizeof(servaddr)) == -1) {
        perror("connect");
        exit(-1);
    }

    printf(">");
    fflush(stdout);   /* the prompt has no newline, so push it out explicitly */
    while (fgets(buf, N, stdin) != NULL) {
        if (send(sockfd, buf, strlen(buf), 0) == -1) {
            perror("send");
            break;
        }

        /* Leave one byte free so the reply can always be terminated. */
        n = recv(sockfd, buf, sizeof(buf) - 1, 0);
        if (n < 0) {
            perror("recv");
            break;
        }
        if (n == 0) {
            /* Server closed the connection. */
            break;
        }
        buf[n] = '\0';

        printf("%s\n", buf);
        printf(">");
        fflush(stdout);
    }
    close(sockfd);
    return 0;
}
执行结果
服务器端:
linuxclass@ubuntu:~/qemulinux/net_ws/tcp$ strace ./server 127.0.0.1 12334
execve("./server", ["./server", "127.0.0.1", "12334"], [/* 60 vars */]) = 0
brk(0) = 0xbf4000
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=95728, ...}) = 0
mmap(NULL, 95728, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7ff559b32000
close(3) = 0
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
open("/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0P \2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1857312, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ff559b31000
mmap(NULL, 3965632, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7ff55955f000
mprotect(0x7ff55971d000, 2097152, PROT_NONE) = 0
mmap(0x7ff55991d000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1be000) = 0x7ff55991d000
mmap(0x7ff559923000, 17088, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7ff559923000
close(3) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ff559b2f000
arch_prctl(ARCH_SET_FS, 0x7ff559b2f740) = 0
mprotect(0x7ff55991d000, 16384, PROT_READ) = 0
mprotect(0x601000, 4096, PROT_READ) = 0
mprotect(0x7ff559b4a000, 4096, PROT_READ) = 0
munmap(0x7ff559b32000, 95728) = 0
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 3
bind(3, {sa_family=AF_INET, sin_port=htons(12334), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
listen(3, 5) = 0
accept(3, {sa_family=AF_INET, sin_port=htons(56512), sin_addr=inet_addr("127.0.0.1")}, [16]) = 4
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 9), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7ff559b49000
write(1, "from 127.0.0.1:56512\n", 21from 127.0.0.1:56512
) = 21
可以看出,在跟踪系统调用的过程中,依次调用了socket-->bind-->listen-->accept-->write
客户端:
linuxclass@ubuntu:~/qemulinux/net_ws/tcp$ strace ./client 127.0.0.1 12334
execve("./client", ["./client", "127.0.0.1", "12334"], [/* 60 vars */]) = 0
brk(0) = 0xa8c000
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=95728, ...}) = 0
mmap(NULL, 95728, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f379fbbd000
close(3) = 0
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
open("/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0P \2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1857312, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f379fbbc000
mmap(NULL, 3965632, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f379f5ea000
mprotect(0x7f379f7a8000, 2097152, PROT_NONE) = 0
mmap(0x7f379f9a8000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1be000) = 0x7f379f9a8000
mmap(0x7f379f9ae000, 17088, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f379f9ae000
close(3) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f379fbba000
arch_prctl(ARCH_SET_FS, 0x7f379fbba740) = 0
mprotect(0x7f379f9a8000, 16384, PROT_READ) = 0
mprotect(0x600000, 4096, PROT_READ) = 0
mprotect(0x7f379fbd5000, 4096, PROT_READ) = 0
munmap(0x7f379fbbd000, 95728) = 0
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 3
connect(3, {sa_family=AF_INET, sin_port=htons(12334), sin_addr=inet_addr("127.0.0.1")}, 16) = 0
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 1), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f379fbd4000
fstat(0, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 1), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f379fbd3000
write(1, ">", 1>) = 1
read(0,
可以看出,在跟踪系统调用的过程中,依次调用了socket-->connect-->write
8. 总结
自己对socket的一点理解,谢谢!
原文:https://www.cnblogs.com/ustc-hwq/p/12069661.html