首页 > 其他 > 详细

TCP Socket Establish;UDP Send Package Process In Kernel Sourcecode Learning

时间:2014-07-30 12:02:24      阅读:454      评论:0      收藏:0      [点我收藏+]

目录

0. 引言
1. TCP握手流程
2. TCP connect() API原理
3. TCP listen() API原理
4. UDP交互过程
5. UDP send() API原理
6. UDP bind() API原理

 

0. 引言

Monitoring project

1. outer link connect: connect()->inet_stream_connect 
2. Suspicious port monitor: listen()->inet_listen 
3. SuspiciousUDP Package Send: sendto()->inet_sendmsg
4. Suspicious port monitor: bind()->inet_bind()

we need monitor the systemcall is:

1. inet_stream_connect
2. inet_listen
3. inet_sendmsg
4. inet_bind()

 

1. TCP握手流程

在和网络有关的编程中,我们常常会使用到Socket库提供的API进行开发,本节将给出一个最基本的socket tcp的ring3代码,在之后的章节,我们将一起从内核源代码的角度学习一下lixnu的网络连接在内核协议栈是怎么实现的

client:

#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <unistd.h>
 
//以下头文件是为了使样例程序正常运行
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char const *argv[])
{
    struct sockaddr_in pin;
    struct hostent *nlp_host;
    int sd; 
    char host_name[256];
    int port;
     
    //初始化主机名和端口。主机名可以是IP,也可以是可被解析的名称
    strcpy(host_name,"127.0.0.1");
    port=8000;
     
    //解析域名,如果是IP则不用解析,如果出错,显示错误信息
    while ((nlp_host=gethostbyname(host_name))==0)
    {
        printf("Resolve Error!\n");
    }
      
    //设置pin变量,包括协议、地址、端口等,此段可直接复制到自己的程序中
    bzero(&pin,sizeof(pin));
    pin.sin_family = AF_INET;                 //AF_INET表示使用IPv4
    pin.sin_addr.s_addr=htonl(INADDR_ANY);  
    pin.sin_addr.s_addr=((struct in_addr *)(nlp_host->h_addr))->s_addr;
    pin.sin_port=htons(port);
     
    //建立socket
    sd=socket(AF_INET,SOCK_STREAM,0);
     
    //建立连接
    while (connect(sd,(struct sockaddr*)&pin,sizeof(pin))==-1)
    {
      printf("Connect Error!\n");
    }
    return 0;
}

server.c

#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <unistd.h>
 
//以下头文件是为了使样例程序正常运行
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char const *argv[])
{
    int serverSocket;
    struct sockaddr_in serverAddr;
    struct sockaddr_in clientAddr;
     
    // 用port保存使用的端口
    int port=8000;
     
    // 建立Socket,并设置
    serverSocket = socket(AF_INET, SOCK_STREAM, 0);
     
    // 设置socket选项,这是可选的,可以避免服务器程序结束后无法快速重新运行
    int val=1;
    setsockopt(serverSocket, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val));
     
    // 定义端口和监听的地址
    serverAddr.sin_family      = AF_INET;
    serverAddr.sin_port        = htons(port);
    serverAddr.sin_addr.s_addr = htonl(INADDR_ANY);
    memset(&(serverAddr.sin_zero), 0, 8);
    int rc = bind(serverSocket, (struct sockaddr*) &serverAddr, sizeof(struct sockaddr));
    if (rc == -1) 
    {
        printf("Bad bind\n");
        exit(1);
    }
     
    // 让serverSocket开始监听,客户队列长为5
    rc = listen(serverSocket, 5);
    if (rc == -1) 
    {
        printf("Bad listen\n");
        exit(1);
    }
     
    // 等待客户连接
    int sock;
    int clientAddrSize = sizeof(struct sockaddr_in);
    sock = accept(serverSocket, (struct sockaddr*) &clientAddr, (socklen_t*) &clientAddrSize);
    if(sock != -1)
    {
        printf("tcp connection established ok! \n");
    }
    else
    {
        printf("error!!\n");
    }

    return 0;
}

从代码中可以看到,在一个Tcp握手连接的过程中,最核心的两个API就是connect()、accept()。我们接下来逐一学习

 

2. connect() API原理

我们知道,对于每一个ring3的C库中的API,都有一个对应的系统调用(C库的API只是对系统调用的一个封装wraper)来实现该接口的功能
整个流程如下:

/*
在ring3层调用socket的API: connect()
*/
1. ring3: 
    1) connect()->

/*
进入ring0层
*/
2. ring0: 
    /*
    SYSCALL_DEFINE2(socketcall..等效于调用sys_socketcall(..
    */
    1) SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
    
    /*
    SYSCALL_DEFINE3(connect..等效于调用sys_connect(..
    */
    2) SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, int, addrlen)-> 

    3) inet_stream_connect()->

    4) tcp_v4_connect()->

    5) tcp_connect()->
        5.1) tcp_connect_init()->
        5.2) tcp_transmit_skb()-> 
    6) 回到inet_stream_connect()->

    7) inet_wait_for_connect()进入SYN等待状态

我们来详细跟踪一下内核源代码,分析的过程中涉及到很多的.c文件和很多的函数嵌套调用,希望朋友们能耐心的结合注释跟着代码的逻辑理解socket tcp的建立过程

\linux-2.6.32.63\net\socket.c

/*
进行调用派发
\linux-2.6.32.63\net\socket.c
*/ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) { unsigned long a[6]; unsigned long a0, a1; int err; unsigned int len; if (call < 1 || call > SYS_ACCEPT4) return -EINVAL; len = nargs[call]; if (len > sizeof(a)) return -EINVAL; /* copy_from_user should be SMP safe. */ if (copy_from_user(a, args, len)) return -EFAULT; audit_socketcall(nargs[call] / sizeof(unsigned long), a); a0 = a[0]; a1 = a[1]; /* 进入内核的sys_socketcall()系统调用后,linux系统需要将特定的调用分路发送到指定目标,例如 1) sockcet()调用将由sys_socket()完成 2) send()调用将由sys_send()完成 系统在include/linux/net.h中定义了以下17种类型socketcall 1) #define SYS_SOCKET 1: sys_socket(2) 2) #define SYS_BIND 2: sys_bind(2) 3) #define SYS_CONNECT 3: sys_connect(2) 4) #define SYS_LISTEN 4: sys_listen(2) 5) #define SYS_ACCEPT 5: sys_accept(2) 6) #define SYS_GETSOCKNAME 6: sys_getsockname(2) 7) #define SYS_GETPEERNAME 7: sys_getpeername(2) 8) #define SYS_SOCKETPAIR 8: sys_socketpair(2) 9) #define SYS_SEND 9: sys_send(2) 10) #define SYS_RECV 10: sys_recv(2) 11) #define SYS_SENDTO 11: sys_sendto(2) 12) #define SYS_RECVFROM 12: sys_recvfrom(2) 13) #define SYS_SHUTDOWN 13: sys_shutdown(2) 14) #define SYS_SETSOCKOPT 14: sys_setsockopt(2) 15) #define SYS_GETSOCKOPT 15: sys_getsockopt(2) 16) #define SYS_SENDMSG 16: sys_sendmsg(2) 17) #define SYS_RECVMSG 17: sys_recvmsg(2) 18) #define SYS_ACCEPT4 18: sys_accept4(2) */ switch (call) SYS_SOCKET { case : err = sys_socket(a0, a1, a[2]); break; case SYS_BIND: err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_CONNECT: /* sys_connect()函数的声明和实现在当前文件的 SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, int, addrlen) 中实现 */ err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_LISTEN: err = sys_listen(a0, a1); break; case SYS_ACCEPT: err = sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 0); break; case SYS_GETSOCKNAME: err = sys_getsockname(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_GETPEERNAME: err = sys_getpeername(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_SOCKETPAIR: err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]); break; case SYS_SEND: err = sys_send(a0, (void __user *)a1, a[2], a[3]); break; case SYS_SENDTO: err = sys_sendto(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], a[5]); break; case SYS_RECV: err = sys_recv(a0, (void __user *)a1, a[2], a[3]); break; case SYS_RECVFROM: err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], (int __user *)a[5]); break; case SYS_SHUTDOWN: err = sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); break; case SYS_GETSOCKOPT: err = sys_getsockopt(a0, a1, a[2], (char __user *)a[3], (int __user *)a[4]); break; case SYS_SENDMSG: err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]); break; case SYS_RECVMSG: err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]); break; case SYS_ACCEPT4: err = sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], a[3]); break; default: err = -EINVAL; break; } return err; } /* 调用sys_connect()
\linux-2.6.32.63\net\socket.c
*/ SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, int, addrlen) { struct socket *sock; struct sockaddr_storage address; int err, fput_needed; /* 找到文件描述符对应的BSD socket结构,在前面的socket调用中建立 */ sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) { goto out; } /* copy对端的地址到内核空间 */ err = move_addr_to_kernel(uservaddr, addrlen, (struct sockaddr *)&address); if (err < 0) { goto out_put; } err = security_socket_connect(sock, (struct sockaddr *)&address, addrlen); if (err) { goto out_put; } /* 调用该BSD socket对应的connect调用 这里的sock是在前面的socket调用的时候就初始化的,对应于TCP的BSD socket的操作符集是inet_stream_ops 要注意的是: 网络子模块在内核初始化的时候(linux-2.6.32.63\net\ipv4\af_inet.c->inet_init()中)就注册了TCP,UDP和RAW3中协议 linux-2.6.32.63\net\ipv4\af_inet.c const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, .poll = tcp_poll, .ioctl = inet_ioctl, .listen = inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = tcp_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = tcp_sendpage, .splice_read = tcp_splice_read, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif };
   EXPORT_SYMBOL(inet_stream_ops); 从上面的结构中可以看到connect对应的函数是inet_stream_connect,我们继续分析inet_stream_connect函数
   \linux-2.6.32.63\net\ipv4\af_inet.c
*/ err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen, sock->file->f_flags); out_put: /* 释放文件的引用 */ fput_light(sock->file, fput_needed); out: return err; } /* 调用inet_stream_connect()
\linux-2.6.32.63\net\ipv4\af_inet.c
*/ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; int err; long timeo; lock_sock(sk); if (uaddr->sa_family == AF_UNSPEC) { err = sk->sk_prot->disconnect(sk, flags); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; goto out; } switch (sock->state) { default: err = -EINVAL; goto out; /* 该BSD socket已连接 */ case SS_CONNECTED: err = -EISCONN; goto out; /* 该BSD socket正在连接 */ case SS_CONNECTING: err = -EALREADY; /* Fall out of switch with err, set for this state */ break; case SS_UNCONNECTED: err = -EISCONN; if (sk->sk_state != TCP_CLOSE) goto out; /* INET SOCKET调用协议特有connect操作符 对于INET socket中的tcp连接,协议特有操作符集为tcp_prot linux-2.6.32.63\net\ipv4\tcp_ipv4.c struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .recvmsg = tcp_recvmsg, .backlog_rcv = tcp_v4_do_rcv, .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem = sysctl_tcp_wmem, .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), .slab_flags = SLAB_DESTROY_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif }; 可以看出,对于TCP,流程进入tcp_v4_connect函数(.connect对应的是tcp_v4_connect()函数),我们继续跟进tcp_v4_connect
     \linux-2.6.32.63\net\ipv4\tcp_ipv4.c
*/ err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; /* 上面的调用完成后,连接并没有完成 */ sock->state = SS_CONNECTING; /* Just entered SS_CONNECTING state; the only * difference is that return value in non-blocking * case is EINPROGRESS, rather than EALREADY. */ err = -EINPROGRESS; break; } /* 获取连接超时时间 */ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { /* Error code is set above 进入定时等待 */ if (!timeo || !inet_wait_for_connect(sk, timeo)) goto out; err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; } /* Connection was closed by RST, timeout, ICMP error * or another process disconnected us. */ if (sk->sk_state == TCP_CLOSE) goto sock_error; /* sk->sk_err may be not zero now, if RECVERR was ordered by user * and error was received after socket entered established state. * Hence, it is handled normally after connect() return successfully. */ sock->state = SS_CONNECTED; err = 0; out: release_sock(sk); return err; sock_error: err = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; if (sk->sk_prot->disconnect(sk, flags)) sock->state = SS_DISCONNECTING; goto out; } EXPORT_SYMBOL(inet_stream_connect); /* 调用tcp_v4_connect()
\linux-2.6.32.63\net\ipv4\tcp_ipv4.c 该函数的主要功能是: 1. 准备好路由,如果源端口没有指定,还要选择一个端口,然后再次更新路由信息 2. 代表该连接的sk结构加入到对应的hash表中(已连接ehash) 3. 获取一个write_seq,以及当 sysctl_tw_recycle设置时,读取上次连接(如果存在)进入time-wait时保存的时戳,赋给当前连接的rx_opt结构中 最后该函数调用tcp_connect来完成连接
*/ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct rtable *rt; __be32 daddr, nexthop; int tmp; int err; struct ip_options_rcu *inet_opt; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; /* 开始准备路由 */ nexthop = daddr = usin->sin_addr.s_addr; inet_opt = inet->inet_opt; if (inet_opt && inet_opt->opt.srr) { if (!daddr) return -EINVAL; nexthop = inet_opt->opt.faddr; } /* 调用路由模块获取出口信息 */ tmp = ip_route_connect(&rt, nexthop, inet->saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, inet->sport, usin->sin_port, sk, 1); if (tmp < 0) { if (tmp == -ENETUNREACH) IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); return tmp; } /* 如果获取的路由是广播或多播域, 返回网络不可达,tcp不支持多播与广播 */ if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { ip_rt_put(rt); return -ENETUNREACH; } if (!inet_opt || !inet_opt->opt.srr) daddr = rt->rt_dst; if (!inet->saddr) inet->saddr = rt->rt_src; inet->rcv_saddr = inet->saddr; if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) { /* Reset inherited state */ tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; tp->write_seq = 0; } if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { struct inet_peer *peer = rt_get_peer(rt); /* * VJ‘s idea. We save last timestamp seen from * the destination in peer table, when entering state * TIME-WAIT * and initialize rx_opt.ts_recent from it, * when trying new connection. */ if (peer != NULL && peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) { tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; tp->rx_opt.ts_recent = peer->tcp_ts; } } inet->dport = usin->sin_port; inet->daddr = daddr; inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; tp->rx_opt.mss_clamp = 536; /* Socket identity is still unknown (sport may be zero). * However we set state to SYN-SENT and not releasing socket * lock select source port, enter ourselves into the hash tables and * complete initialization after this. */ tcp_set_state(sk, TCP_SYN_SENT); err = inet_hash_connect(&tcp_death_row, sk); if (err) goto failure; err = ip_route_newports(&rt, IPPROTO_TCP, inet->sport, inet->dport, sk); if (err) goto failure; /* OK, now commit destination to socket. */ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->u.dst); if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(inet->saddr, inet->daddr, inet->sport, usin->sin_port); /* id是IP包头的id域 */ inet->id = tp->write_seq ^ jiffies; /* 调用tcp_connect来完成连接 我们继续跟踪tcp_connect() \linux-2.6.32.63\net\ipv4\tcp_output.c */ err = tcp_connect(sk); rt = NULL; if (err) goto failure; return 0; failure: /* * This unhashes the socket and releases the local port, * if necessary. */ tcp_set_state(sk, TCP_CLOSE); ip_rt_put(rt); sk->sk_route_caps = 0; inet->dport = 0; return err; } /* 在tcp_v4_connect()的最后,调用tcp_connect来完成连接
\linux-2.6.32.63\net\ipv4\tcp_output.c 该函数的流程是 1. 初始化该次连接的sk结构 2. 分配一个skb数据包 3. 初始化skb 4 skb加入发送队列后,调用tcp_transmit_skb发送该数据包 5. 更新snd_nxt,并启动超时定时器
*/ int tcp_connect(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; /* 初始化连接对应的INET socket结构的参数,为连接做准备
  下面跟进tcp_connect_init()
  \linux-2.6.32.63\net\ipv4\tcp_output.c
*/ tcp_connect_init(sk); /* 获取一个skb,由于是syn包,没有数据,所以大小是MAX_TCP_HEADER的16位对齐 */ buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); if (unlikely(buff == NULL)) return -ENOBUFS; /* Reserve space for headers. */ skb_reserve(buff, MAX_TCP_HEADER); tp->snd_nxt = tp->write_seq; /* 设置skb相关参数 */ tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN); /* 设置ECN */ TCP_ECN_send_syn(sk, buff); /* Send it off. */ /* 保存该数据包的发送时间 */ TCP_SKB_CB(buff)->when = tcp_time_stamp; tp->retrans_stamp = TCP_SKB_CB(buff)->when; skb_header_release(buff); /* 加入发送队列,待确认后在丢弃 */ __tcp_add_write_queue_tail(sk, buff); sk->sk_wmem_queued += buff->truesize; sk_mem_charge(sk, buff->truesize); tp->packets_out += tcp_skb_pcount(buff); /* 发送数据包,调用tcp_transimit_skb函数
  下面跟进tcp_transmit_skb进行分析
  \linux-2.6.32.63\net\ipv4\tcp_output.c
*/ tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 0; } /* 接下来跟进一下tcp_connect_init函数,它负责sk结构的初始化(在tcp_connect()的一开始)
\linux-2.6.32.63\net\ipv4\tcp_output.c
*/ static void tcp_connect_init(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; /* We‘ll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. */ tp->tcp_header_len = sizeof(struct tcphdr) + (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); #ifdef CONFIG_TCP_MD5SIG if (tp->af_specific->md5_lookup(sk, sk) != NULL) tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif /* If user gave his TCP_MAXSEG, record it to clamp */ if (tp->rx_opt.user_mss) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; tp->max_window = 0; /* 初始化MTU probe */ tcp_mtup_init(sk); /* 设置mss */ tcp_sync_mss(sk, dst_mtu(dst)); if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) tp->advmss = tp->rx_opt.user_mss; tcp_initialize_rcv_mss(sk); /* 根据接收空间大小初始化一个通告窗口 */ tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, sysctl_tcp_window_scaling, &rcv_wscale); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; sk->sk_err = 0; sock_reset_flag(sk, SOCK_DONE); tp->snd_wnd = 0; /* 更新一些滑动窗口的成员 */ tcp_init_wl(tp, 0); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; tp->rcv_nxt = 0; tp->rcv_wup = 0; tp->copied_seq = 0; inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } /* 我们继续分析,tcp_connect(),在tcp_connect()的最后,函数调用了tcp_transmit_skb()进行数据报发送
\linux-2.6.32.63\net\ipv4\tcp_output.c
*/ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; struct tcp_sock *tp; struct tcp_skb_cb *tcb; struct tcp_out_options opts; unsigned tcp_options_size, tcp_header_size; struct tcp_md5sig_key *md5; __u8 *md5_hash_location; struct tcphdr *th; int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); /* If congestion control is doing timestamping, we must * take such a timestamp before we potentially clone/copy. */ if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP) __net_timestamp(skb); /* 因为发送时要保留skb以备重传,所以大部分时候都设置了clone_it,不过发送ack除外 */ if (likely(clone_it)) { if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); else skb = skb_clone(skb, gfp_mask); if (unlikely(!skb)) return -ENOBUFS; } inet = inet_sk(sk); tp = tcp_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); /* 获取tcp选项部分信息,分为syn包和普通包2部分,因为有的选项只在syn中设置 */ if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); else tcp_options_size = tcp_established_options(sk, skb, &opts, &md5); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); /* 该函数判断是否网络中有数据存在,没有就通知拥塞控制模块 */ if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_set_owner_w(skb, sk); /* Build TCP header and checksum it. */ th = tcp_hdr(skb); th->source = inet->sport; th->dest = inet->dport; th->seq = htonl(tcb->seq); th->ack_seq = htonl(tp->rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { /* RFC1323: The window in SYN & SYN/ACK segments is never scaled.SYN包和SYN/ACK包中窗口不扩放 */ th->window = htons(min(tp->rcv_wnd, 65535U)); } else { th->window = htons(tcp_select_window(sk)); } th->check = 0; th->urg_ptr = 0; /* The urg_mode check is necessary during a below snd_una win probe */ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { if (before(tp->snd_up, tcb->seq + 0x10000)) { th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { th->urg_ptr = 0xFFFF; th->urg = 1; } } /* 填充TCP 选项字段 */ tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) /* 如果不是SYN包的话,就检查是否需要向对方发送ECN */ TCP_ECN_send(sk, skb, tcp_header_size); #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ if (md5) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; tp->af_specific->calc_md5_hash(md5_hash_location, md5, sk, NULL, skb); } #endif /* 校验和相关计算 */ icsk->icsk_af_ops->send_check(sk, skb->len, skb); if (likely(tcb->flags & TCPCB_FLAG_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); if (skb->len != tcp_header_size) tcp_event_data_sent(tp, skb, sk); if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); /* 调用IP层的发送函数 */ err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (likely(err <= 0)) return err; tcp_enter_cwr(sk, 1); return net_xmit_eval(err); } /* skb发送后,connect并没有返回,因为此时连接还没有建立,tcp进入等待状态,此时回到前面的inet_stream_connect函数 我们继续跟进分析inet_wait_for_connect()
\linux-2.6.32.63\net\ipv4\af_inet.c 在发送syn后进入等待状态
*/ static long inet_wait_for_connect(struct sock *sk, long timeo) { DEFINE_WAIT(wait); /* sk_sleep保存此INET SOCKET的等待队列 */ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. * Connect() does not allow to get error notifications * without closing the socket. */ /* 定时等待知道状态变化 */ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); if (signal_pending(current) || !timeo) break; prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); } finish_wait(sk->sk_sleep, &wait); return timeo; }

到此为止的代码,只是完成了connect()过程中的SYN包的发送过程的分析,要完成一个完整的TCP连接,还需要有被动方的SYN/ACK、以及发送方的ACK过程,这里就暂时不做深入分析了,因为我只关心发送方主动向外进行连接的"connect()动作",即"inet_stream_connect"这个系统调用

有关TCP握手的完整过程,请参阅下面的链接

Relevant Link:

http://docs.huihoo.com/joyfire.net/6-1.html
http://blog.csdn.net/bycy2005/article/details/3096861
http://blog.csdn.net/chensichensi/article/details/5272346
http://blog.csdn.net/chensichensi/article/details/5272696
http://blog.csdn.net/chensichensi/article/details/5273481

 

3. listen() API原理

listen()从ring3到ring0的整个流程如下:

/*
在ring3层调用socket的API: listen()
*/
1. ring3: 
    1) listen()->

/*
进入ring0层
*/
2. ring0: 
    /*
    SYSCALL_DEFINE2(socketcall..等效于调用sys_socketcall(..
    */
    1) SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
    
    /*
    SYSCALL_DEFINE2(listen..等效于调用sys_listen(..
    */
    2) SYSCALL_DEFINE2(listen, int, fd, int, backlog)-> 

    3) inet_listen()->
    
    /*
    监听指定端口
    */
    4) inet_csk_listen_start()-> 

内核源代码分析

\linux-2.6.32.63\net\socket.c

/*
进行调用派发
\linux-2.6.32.63\net\socket.c
*/ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) { unsigned long a[6]; unsigned long a0, a1; int err; unsigned int len; if (call < 1 || call > SYS_ACCEPT4) return -EINVAL; len = nargs[call]; if (len > sizeof(a)) return -EINVAL; /* copy_from_user should be SMP safe. */ if (copy_from_user(a, args, len)) return -EFAULT; audit_socketcall(nargs[call] / sizeof(unsigned long), a); a0 = a[0]; a1 = a[1]; /* 进入内核的sys_socketcall()系统调用后,linux系统需要将特定的调用分路发送到指定目标,例如 1) sockcet()调用将由sys_socket()完成 2) send()调用将由sys_send()完成 系统在include/linux/net.h中定义了以下17种类型socketcall 1) #define SYS_SOCKET 1: sys_socket(2) 2) #define SYS_BIND 2: sys_bind(2) 3) #define SYS_CONNECT 3: sys_connect(2) 4) #define SYS_LISTEN 4: sys_listen(2) 5) #define SYS_ACCEPT 5: sys_accept(2) 6) #define SYS_GETSOCKNAME 6: sys_getsockname(2) 7) #define SYS_GETPEERNAME 7: sys_getpeername(2) 8) #define SYS_SOCKETPAIR 8: sys_socketpair(2) 9) #define SYS_SEND 9: sys_send(2) 10) #define SYS_RECV 10: sys_recv(2) 11) #define SYS_SENDTO 11: sys_sendto(2) 12) #define SYS_RECVFROM 12: sys_recvfrom(2) 13) #define SYS_SHUTDOWN 13: sys_shutdown(2) 14) #define SYS_SETSOCKOPT 14: sys_setsockopt(2) 15) #define SYS_GETSOCKOPT 15: sys_getsockopt(2) 16) #define SYS_SENDMSG 16: sys_sendmsg(2) 17) #define SYS_RECVMSG 17: sys_recvmsg(2) 18) #define SYS_ACCEPT4 18: sys_accept4(2) */ switch (call) SYS_SOCKET { case : err = sys_socket(a0, a1, a[2]); break; case SYS_BIND: err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_CONNECT: err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_LISTEN: /* SYSCALL_DEFINE2(listen, int, fd, int, backlog) 等效于调用sys_listen 继续跟进SYSCALL_DEFINE2(listen, int, fd, int, backlog) */ err = sys_listen(a0, a1); break; case SYS_ACCEPT: err = sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 0); break; case SYS_GETSOCKNAME: err = sys_getsockname(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_GETPEERNAME: err = sys_getpeername(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_SOCKETPAIR: err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]); break; case SYS_SEND: err = sys_send(a0, (void __user *)a1, a[2], a[3]); break; case SYS_SENDTO: err = sys_sendto(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], a[5]); break; case SYS_RECV: err = sys_recv(a0, (void __user *)a1, a[2], a[3]); break; case SYS_RECVFROM: err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], (int __user *)a[5]); break; case SYS_SHUTDOWN: err = sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); break; case SYS_GETSOCKOPT: err = sys_getsockopt(a0, a1, a[2], (char __user *)a[3], (int __user *)a[4]); break; case SYS_SENDMSG: err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]); break; case SYS_RECVMSG: err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]); break; case SYS_ACCEPT4: err = sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], a[3]); break; default: err = -EINVAL; break; } return err; } /* 跟进SYSCALL_DEFINE2(listen, int, fd, int, backlog)进行分析 在sys_listen函数中,系统是调用"inet_listen"这个系统调用完成listen功能的
\linux-2.6.32.63\net\socket.c
*/ SYSCALL_DEFINE2(listen, int, fd, int, backlog) { struct socket *sock; int err, fput_needed; int somaxconn; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn; if ((unsigned)backlog > somaxconn) backlog = somaxconn; err = security_socket_listen(sock, backlog); if (!err) /* 调用该BSD socket对应的listen调用 这里的sock是在前面的socket调用的时候就初始化的,对应于TCP的BSD socket的操作符集是inet_stream_ops 要注意的是: 网络子模块在内核初始化的时候(linux-2.6.32.63\net\ipv4\af_inet.c->inet_init()中)就注册了TCP,UDP和RAW3中协议 linux-2.6.32.63\net\ipv4\af_inet.c const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, .poll = tcp_poll, .ioctl = inet_ioctl, .listen = inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = tcp_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = tcp_sendpage, .splice_read = tcp_splice_read, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif };
       EXPORT_SYMBOL(inet_stream_ops); 从上面的结构中可以看到listen对应的函数是inet_listen,我们继续分析inet_listen函数 \linux-2.6.32.63\net\ipv4\af_inet.c
*/ err = sock->ops->listen(sock, backlog); fput_light(sock->file, fput_needed); } return err; } /* 我们继续跟进inet_listen() \linux-2.6.32.63\net\ipv4\af_inet.c */ int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; int err; /* 在进行listen初始化之前对sk结构体进行加锁 */ lock_sock(sk); err = -EINVAL; if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) goto out; old_state = sk->sk_state; if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) goto out; /* Really, if the socket is already in listen state we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { /* 调用inet_csk_listen_start()进行端口监听 我们继续跟进inet_csk_listen_start()进行分析 \linux-2.6.32.63\net\ipv4\inet_connection_sock.c */ err = inet_csk_listen_start(sk, backlog); if (err) goto out; } sk->sk_max_ack_backlog = backlog; err = 0; out: /* 解锁 */ release_sock(sk); return err; } EXPORT_SYMBOL(inet_listen); /* 我们继续跟进inet_csk_listen_start() \linux-2.6.32.63\net\ipv4\inet_connection_sock.c */ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) { struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); if (rc != 0) return rc; sk->sk_max_ack_backlog = 0; sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). * It is OK, because this socket enters to hash table only * after validation is complete. */ sk->sk_state = TCP_LISTEN; if (!sk->sk_prot->get_port(sk, inet->num)) { inet->sport = htons(inet->num); sk_dst_reset(sk); sk->sk_prot->hash(sk); return 0; } sk->sk_state = TCP_CLOSE; __reqsk_queue_destroy(&icsk->icsk_accept_queue); return -EADDRINUSE; } EXPORT_SYMBOL_GPL(inet_csk_listen_start);

 

 

4. UDP交互过程

UDP的数据报发送常见于一些隐蔽通道、后门端口(例如icmp通道、dns隐蔽通道等等),和TCP不同的是,UDP是一种无状态的协议,这意味着它不需要建立三次握手连接即可在通信双方之间发送数据

bubuko.com,布布扣

bubuko.com,布布扣

client.c

#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>

#define ERR_EXIT(m)         do         {                 perror(m);                 exit(EXIT_FAILURE);         } while(0)

void echo_cli(int sock)
{
    struct sockaddr_in servaddr;
    memset(&servaddr, 0, sizeof(servaddr));
    servaddr.sin_family = AF_INET;
    servaddr.sin_port = htons(5188);
    servaddr.sin_addr.s_addr = inet_addr("127.0.0.1");

    int ret;
    char sendbuf[1024] = {0};
    char recvbuf[1024] = {0};
    while (fgets(sendbuf, sizeof(sendbuf), stdin) != NULL)
    { 
        sendto(sock, sendbuf, strlen(sendbuf), 0, (struct sockaddr *)&servaddr, sizeof(servaddr));

        ret = recvfrom(sock, recvbuf, sizeof(recvbuf), 0, NULL, NULL);
        if (ret == -1)
        {
            if (errno == EINTR)
                continue;
            ERR_EXIT("recvfrom");
        }

        fputs(recvbuf, stdout);
        memset(sendbuf, 0, sizeof(sendbuf));
        memset(recvbuf, 0, sizeof(recvbuf));
    }

    close(sock);


}

int main(void)
{
    int sock;
    if ((sock = socket(PF_INET, SOCK_DGRAM, 0)) < 0)
        ERR_EXIT("socket");

    echo_cli(sock);

    return 0;
}

server.c

#include<stdio.h>
#include<stdlib.h>
#include<unistd.h>
#include<errno.h>
#include<sys/types.h>
#include<sys/socket.h>
#include<netinet/in.h>
#include<string.h>

#define ERR_EXIT(m)     do {         perror(m);         exit(EXIT_FAILURE);     } while (0)

void echo_ser(int sock)
{
    char recvbuf[1024] = {0};
    struct sockaddr_in peeraddr;
    socklen_t peerlen;
    int n;

    while (1)
    {

        peerlen = sizeof(peeraddr);
        memset(recvbuf, 0, sizeof(recvbuf));
        n = recvfrom(sock, recvbuf, sizeof(recvbuf), 0,
                     (struct sockaddr *)&peeraddr, &peerlen);
        if (n == -1)
        {

            if (errno == EINTR)
                continue;

            ERR_EXIT("recvfrom error");
        }
        else if(n > 0)
        {

            fputs(recvbuf, stdout);
            sendto(sock, recvbuf, n, 0,
                   (struct sockaddr *)&peeraddr, peerlen);
        }
    }
    close(sock);
}

int main(void)
{
    int sock;
    if ((sock = socket(PF_INET, SOCK_DGRAM, 0)) < 0)
        ERR_EXIT("socket error");

    struct sockaddr_in servaddr;
    memset(&servaddr, 0, sizeof(servaddr));
    servaddr.sin_family = AF_INET;
    servaddr.sin_port = htons(5188);
    servaddr.sin_addr.s_addr = htonl(INADDR_ANY);

    if (bind(sock, (struct sockaddr *)&servaddr, sizeof(servaddr)) < 0)
        ERR_EXIT("bind error");

    echo_ser(sock);

    return 0;
}

Relevant Link:

http://blog.csdn.net/jnu_simba/article/details/9077455
http://www.cplusplus.me/1070.html

 

5. UDP sendto() API原理

sendto()从ring3到ring0的整个流程如下:

/*
在ring3层调用socket的API: sendto()
*/
1. ring3: 
    1) sendto()->

/*
进入ring0层
*/
2. ring0: 
    /*
    SYSCALL_DEFINE2(socketcall..等效于调用sys_socketcall(..
    */
    1) SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
    
    /*
    SYSCALL_DEFINE6(sendto..等效于调用sys_sendto(..
    */
    2) SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, unsigned, flags, struct sockaddr __user *, addr, int, addr_len)-> 

    3) sock_sendmsg()->

    4) __sock_sendmsg()->

    5) inet_sendmsg()->
    
    /*
    发送UDP数据报
    */
    6) udp_sendmsg()-> 

\linux-2.6.32.63\net\socket.c

/*
进行调用派发
\linux-2.6.32.63\net\socket.c
*/ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) { unsigned long a[6]; unsigned long a0, a1; int err; unsigned int len; if (call < 1 || call > SYS_ACCEPT4) return -EINVAL; len = nargs[call]; if (len > sizeof(a)) return -EINVAL; /* copy_from_user should be SMP safe. */ if (copy_from_user(a, args, len)) return -EFAULT; audit_socketcall(nargs[call] / sizeof(unsigned long), a); a0 = a[0]; a1 = a[1]; /* 进入内核的sys_socketcall()系统调用后,linux系统需要将特定的调用分路发送到指定目标,例如 1) sockcet()调用将由sys_socket()完成 2) send()调用将由sys_send()完成 系统在include/linux/net.h中定义了以下17种类型socketcall 1) #define SYS_SOCKET 1: sys_socket(2) 2) #define SYS_BIND 2: sys_bind(2) 3) #define SYS_CONNECT 3: sys_connect(2) 4) #define SYS_LISTEN 4: sys_listen(2) 5) #define SYS_ACCEPT 5: sys_accept(2) 6) #define SYS_GETSOCKNAME 6: sys_getsockname(2) 7) #define SYS_GETPEERNAME 7: sys_getpeername(2) 8) #define SYS_SOCKETPAIR 8: sys_socketpair(2) 9) #define SYS_SEND 9: sys_send(2) 10) #define SYS_RECV 10: sys_recv(2) 11) #define SYS_SENDTO 11: sys_sendto(2) 12) #define SYS_RECVFROM 12: sys_recvfrom(2) 13) #define SYS_SHUTDOWN 13: sys_shutdown(2) 14) #define SYS_SETSOCKOPT 14: sys_setsockopt(2) 15) #define SYS_GETSOCKOPT 15: sys_getsockopt(2) 16) #define SYS_SENDMSG 16: sys_sendmsg(2) 17) #define SYS_RECVMSG 17: sys_recvmsg(2) 18) #define SYS_ACCEPT4 18: sys_accept4(2) */ switch (call) SYS_SOCKET { case : err = sys_socket(a0, a1, a[2]); break; case SYS_BIND: err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_CONNECT: err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_LISTEN: err = sys_listen(a0, a1); break; case SYS_ACCEPT: err = sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 0); break; case SYS_GETSOCKNAME: err = sys_getsockname(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_GETPEERNAME: err = sys_getpeername(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_SOCKETPAIR: err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]); break; case SYS_SEND: err = sys_send(a0, (void __user *)a1, a[2], a[3]); break; /* 等效于调用 SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, unsigned, flags, struct sockaddr __user *, addr, int, addr_len) */ case SYS_SENDTO: err = sys_sendto(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], a[5]); break; case SYS_RECV: err = sys_recv(a0, (void __user *)a1, a[2], a[3]); break; case SYS_RECVFROM: err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], (int __user *)a[5]); break; case SYS_SHUTDOWN: err = sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); break; case SYS_GETSOCKOPT: err = sys_getsockopt(a0, a1, a[2], (char __user *)a[3], (int __user *)a[4]); break; case SYS_SENDMSG: err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]); break; case SYS_RECVMSG: err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]); break; case SYS_ACCEPT4: err = sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], a[3]); break; default: err = -EINVAL; break; } return err; } /* 调用 SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, unsigned, flags, struct sockaddr __user *, addr, int, addr_len)
\linux-2.6.32.63\net\socket.c
*/ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, unsigned, flags, struct sockaddr __user *, addr, int, addr_len) { struct socket *sock; struct sockaddr_storage address; int err; struct msghdr msg; struct iovec iov; int fput_needed; if (len > INT_MAX) len = INT_MAX; /* 通过函数sockfd_lookup_light和参数fd,来得到对应的sock sockfd_lookup_light的实现比较简单 1. fd就是进程的fdtable的索引 2. 通过这个fd索引就可以得到对应的file指针 3. 然后在从file指针中,得到sock的地址 static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) { struct file *file; struct socket *sock; *err = -EBADF; file = fget_light(fd, fput_needed); if (file) { sock = sock_from_file(file, err); if (sock) return sock; fput_light(file, *fput_needed); } return NULL; } */ sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; /* 初始化iov和msg,因为这里的消息传递方式采用的是BSD的消息传递方式 struct msghdr { void * msg_name; // Socket name int msg_namelen; // Length of name struct iovec * msg_iov; // Data blocks __kernel_size_t msg_iovlen; // Number of blocks void * msg_control; // Per protocol magic (eg BSD file descriptor passing) __kernel_size_t msg_controllen;// Length of cmsg list unsigned msg_flags; }; */ iov.iov_base = buff; iov.iov_len = len; msg.msg_name = NULL; msg.msg_iov = &iov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; /* 1. 如果sendto指定了addr,那么首先将用户空间的地址addr复制到kernel空间的address中,并用内核空间的address来初始化msg */ if (addr) { err = move_addr_to_kernel(addr, addr_len, (struct sockaddr *)&address); if (err < 0) goto out_put; msg.msg_name = (struct sockaddr *)&address; msg.msg_namelen = addr_len; } /* 如果该socket指定了O_NONBLOCK,那么将flags设置上MSG_DONTWAIT,并将flags赋给msg.msg_flags */ if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; /* 最后调用sock_sendmsg,将msg发送出去 我们继续跟进sock_sendmsg()分析
  \linux-2.6.32.63\net\socket.c
*/ err = sock_sendmsg(sock, &msg, len); out_put: fput_light(sock->file, fput_needed); out: return err; } /* 继续跟进sock_sendmsg()分析 \linux-2.6.32.63\net\socket.c */ int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct kiocb iocb; struct sock_iocb siocb; int ret; init_sync_kiocb(&iocb, NULL); iocb.private = &siocb; /* 调用__sock_sendmsg进行UDP数据报的发送 继续跟进__sock_sendmsg()进行分析
   \linux-2.6.32.63\net\socket.c
*/ ret = __sock_sendmsg(&iocb, sock, msg, size); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&iocb); return ret; } /* 继续跟进__sock_sendmsg()分析 \linux-2.6.32.63\net\socket.c */ static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size) { struct sock_iocb *si = kiocb_to_siocb(iocb); int err; si->sock = sock; si->scm = NULL; si->msg = msg; si->size = size; err = security_socket_sendmsg(sock, msg, size); if (err) return err; /* const struct proto_ops inet_dgram_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, .poll = udp_poll, .ioctl = inet_ioctl, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif }; EXPORT_SYMBOL(inet_dgram_ops); 从结构体中可以看出,sendmsg()对应的系统调用是inet_sendmsg() 我们继续跟进分析inet_sendmsg() \linux-2.6.32.63\net\ipv4\af_inet.c */ return sock->ops->sendmsg(iocb, sock, msg, size); } /* 继续跟进分析inet_sendmsg() \linux-2.6.32.63\net\ipv4\af_inet.c */ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; /* We may need to bind the socket. */ if (!inet_sk(sk)->num && inet_autobind(sk)) return -EAGAIN; /* INET SOCKET调用协议特有sendmsg操作符 对于INET socket中的udp发送,协议特有操作符集为udp_prot linux-2.6.32.63\net\ipv4\udp.c struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, .close = udp_lib_close, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .destroy = udp_destroy_sock, .setsockopt = udp_setsockopt, .getsockopt = udp_getsockopt, .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .sendpage = udp_sendpage, .backlog_rcv = __udp_queue_rcv_skb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .get_port = udp_v4_get_port, .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_rmem = &sysctl_udp_rmem_min, .obj_size = sizeof(struct udp_sock), .slab_flags = SLAB_DESTROY_BY_RCU, .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_udp_setsockopt, .compat_getsockopt = compat_udp_getsockopt, #endif }; EXPORT_SYMBOL(udp_prot); 可以看出,对于UDP,流程进入udp_sendmsg函数(.sendmsg对应的是udp_sendmsg()函数),我们继续跟进udp_sendmsg() \linux-2.6.32.63\net\ipv4\udp.c */ return sk->sk_prot->sendmsg(iocb, sk, msg, size); } EXPORT_SYMBOL(inet_sendmsg); /* 继续跟进udp_sendmsg() \linux-2.6.32.63\net\ipv4\udp.c */ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); int ulen = len; struct ipcm_cookie ipc; struct rtable *rt = NULL; int free = 0; int connected = 0; __be32 daddr, faddr, saddr; __be16 dport; u8 tos; int err, is_udplite = IS_UDPLITE(sk); int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct ip_options_data opt_copy; if (len > 0xFFFF) return -EMSGSIZE; /* * Check the flags. */ if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; ipc.opt = NULL; ipc.shtx.flags = 0; if (up->pending) { /* * There are pending frames. * The socket lock must be held while it‘s corked. */ lock_sock(sk); if (likely(up->pending)) { if (unlikely(up->pending != AF_INET)) { release_sock(sk); return -EINVAL; } goto do_append_data; } release_sock(sk); } ulen += sizeof(struct udphdr); /* * Get and verify the address. */ if (msg->msg_name) { struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name; if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { if (usin->sin_family != AF_UNSPEC) return -EAFNOSUPPORT; } daddr = usin->sin_addr.s_addr; dport = usin->sin_port; if (dport == 0) return -EINVAL; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = inet->daddr; dport = inet->dport; /* Open fast path for connected socket. Route will not be used, if at least one option is set. */ connected = 1; } ipc.addr = inet->saddr; ipc.oif = sk->sk_bound_dev_if; err = sock_tx_timestamp(msg, sk, &ipc.shtx); if (err) return err; if (msg->msg_controllen) { err = ip_cmsg_send(sock_net(sk), msg, &ipc); if (err) return err; if (ipc.opt) free = 1; connected = 0; } if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(&opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = &opt_copy.opt; } rcu_read_unlock(); } saddr = ipc.addr; ipc.addr = faddr = daddr; if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) return -EINVAL; faddr = ipc.opt->opt.faddr; connected = 0; } tos = RT_TOS(inet->tos); if (sock_flag(sk, SOCK_LOCALROUTE) || (msg->msg_flags & MSG_DONTROUTE) || (ipc.opt && ipc.opt->opt.is_strictroute)) { tos |= RTO_ONLINK; connected = 0; } if (ipv4_is_multicast(daddr)) { if (!ipc.oif) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; connected = 0; } if (connected) rt = (struct rtable *)sk_dst_check(sk, 0); if (rt == NULL) { struct flowi fl = { .oif = ipc.oif, .mark = sk->sk_mark, .nl_u = { .ip4_u = { .daddr = faddr, .saddr = saddr, .tos = tos } }, .proto = sk->sk_protocol, .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = { .sport = inet->sport, .dport = dport } } }; struct net *net = sock_net(sk); security_sk_classify_flow(sk, &fl); err = ip_route_output_flow(net, &rt, &fl, sk, 1); if (err) { if (err == -ENETUNREACH) IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); goto out; } err = -EACCES; if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) goto out; if (connected) sk_dst_set(sk, dst_clone(&rt->u.dst)); } if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: saddr = rt->rt_src; if (!ipc.addr) daddr = ipc.addr = rt->rt_dst; lock_sock(sk); if (unlikely(up->pending)) { /* The socket is already corked while preparing it. */ /* ... which is an evident application bug. --ANK */ release_sock(sk); LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); err = -EINVAL; goto out; } /* * Now cork the socket to pend data. */ inet->cork.fl.fl4_dst = daddr; inet->cork.fl.fl_ip_dport = dport; inet->cork.fl.fl4_src = saddr; inet->cork.fl.fl_ip_sport = inet->sport; up->pending = AF_INET; do_append_data: up->len += ulen; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_flush_pending_frames(sk); else if (!corkreq) err = udp_push_pending_frames(sk); else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) up->pending = 0; release_sock(sk); out: ip_rt_put(rt); if (free) kfree(ipc.opt); if (!err) return len; /* * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting * ENOBUFS might not be good (it‘s not tunable per se), but otherwise * we don‘t have a good statistic (IpOutDiscards but it can be too many * things). We could add another new stat but at least for now that * seems like overkill. */ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); } return err; do_confirm: dst_confirm(&rt->u.dst); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm; err = 0; goto out; } EXPORT_SYMBOL(udp_sendmsg);

Relevant Link:

http://blog.chinaunix.net/uid-23629988-id-85912.html

 

6. UDP bind() API原理

对于UDP的服务端程序来说(很多隐藏通道、icmp、iis恶意模块...)都需要监听一个UDP端口,对于TCP服务端程序,可以使用listen监听指定的IP和端口,但是对于UDP,它必须使用bind()进行端口绑定

bind()从ring3到ring0的整个流程如下:

/*
在ring3层调用socket的API: bind()
*/
1. ring3: 
    1) bind()->

/*
进入ring0层
*/
2. ring0: 
    /*
    SYSCALL_DEFINE2(socketcall..等效于调用sys_socketcall(..
    */
    1) SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
    
    /*
    SYSCALL_DEFINE3(bind..等效于调用sys_bind(..
    */
    2) SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)-> 
    
    /*
    绑定指定端口
    */
    3) inet_bind()->

\linux-2.6.32.63\net\socket.c

/*
进行调用派发
\linux-2.6.32.63\net\socket.c
*/ 
SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
    unsigned long a[6];
    unsigned long a0, a1;
    int err;
    unsigned int len;

    if (call < 1 || call > SYS_ACCEPT4)
        return -EINVAL;

    len = nargs[call];
    if (len > sizeof(a))
        return -EINVAL;

    /* copy_from_user should be SMP safe. */
    if (copy_from_user(a, args, len))
        return -EFAULT;

    audit_socketcall(nargs[call] / sizeof(unsigned long), a);

    a0 = a[0];
    a1 = a[1];

    /*
    进入内核的sys_socketcall()系统调用后,linux系统需要将特定的调用分路发送到指定目标,例如
        1) sockcet()调用将由sys_socket()完成 
        2) send()调用将由sys_send()完成
    系统在include/linux/net.h中定义了以下17种类型socketcall
        1) #define SYS_SOCKET        1: sys_socket(2)     
        2) #define SYS_BIND            2: sys_bind(2)     
        3) #define SYS_CONNECT        3: sys_connect(2) 
        4) #define SYS_LISTEN        4: sys_listen(2)     
        5) #define SYS_ACCEPT        5: sys_accept(2)         
        6) #define SYS_GETSOCKNAME    6: sys_getsockname(2) 
        7) #define SYS_GETPEERNAME    7: sys_getpeername(2)     
        8) #define SYS_SOCKETPAIR    8: sys_socketpair(2)     
        9) #define SYS_SEND            9: sys_send(2)             
        10) #define SYS_RECV        10: sys_recv(2)             
        11) #define SYS_SENDTO        11: sys_sendto(2)     
        12) #define SYS_RECVFROM    12: sys_recvfrom(2)         
        13) #define SYS_SHUTDOWN    13: sys_shutdown(2)         
        14) #define SYS_SETSOCKOPT    14: sys_setsockopt(2)         
        15) #define SYS_GETSOCKOPT    15: sys_getsockopt(2)         
        16) #define SYS_SENDMSG        16: sys_sendmsg(2)         
        17) #define SYS_RECVMSG        17: sys_recvmsg(2)         
        18) #define SYS_ACCEPT4        18: sys_accept4(2)         
    */
    switch (call) SYS_SOCKET
    {
    case :
        err = sys_socket(a0, a1, a[2]);
        break;
    /*
    等效于调用
    SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
    \linux-2.6.32.63\net\socket.c
    */
    case SYS_BIND:
        err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_CONNECT:
        err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_LISTEN: 
        err = sys_listen(a0, a1);
        break;
    case SYS_ACCEPT:
        err = sys_accept4(a0, (struct sockaddr __user *)a1,
                  (int __user *)a[2], 0);
        break;
    case SYS_GETSOCKNAME:
        err =
            sys_getsockname(a0, (struct sockaddr __user *)a1,
                    (int __user *)a[2]);
        break;
    case SYS_GETPEERNAME:
        err =
            sys_getpeername(a0, (struct sockaddr __user *)a1,
                    (int __user *)a[2]);
        break;
    case SYS_SOCKETPAIR:
        err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
        break;
    case SYS_SEND:
        err = sys_send(a0, (void __user *)a1, a[2], a[3]);
        break; 
    case SYS_SENDTO:
        err = sys_sendto(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], a[5]);
        break;
    case SYS_RECV:
        err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
        break;
    case SYS_RECVFROM:
        err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                   (struct sockaddr __user *)a[4],
                   (int __user *)a[5]);
        break;
    case SYS_SHUTDOWN:
        err = sys_shutdown(a0, a1);
        break;
    case SYS_SETSOCKOPT:
        err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
        break;
    case SYS_GETSOCKOPT:
        err =
            sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
                   (int __user *)a[4]);
        break;
    case SYS_SENDMSG:
        err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
        break;
    case SYS_RECVMSG:
        err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
        break;
    case SYS_ACCEPT4:
        err = sys_accept4(a0, (struct sockaddr __user *)a1,
                  (int __user *)a[2], a[3]);
        break;
    default:
        err = -EINVAL;
        break;
    }
    return err;
}


/*
继续跟进
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
1. sys_bind()首先调用sockfd_lookup_light()查找套接字对应的socket实例,如果没有找到,则返回EBADF错误
2. 在进行绑定操作之前,要先将用户传入的本地协议地址从用户空间拷贝到内核缓冲区中,在拷贝过程中会检查用户传入的地址是否正确。如果指定的长度参数小于0或者大于sockaddr_storage的大小,则返回EINVAL错误
3. 如果在调用copy_from_user()执行拷贝操作过程中出现错误,则返回EFAULT错误
4. 在上述的准备工作都完成后,调用inet_bind()函数(即sock->ops->bind指向的函数)来完成绑定操作
\linux-2.6.32.63\net\socket.c
*/
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
    struct socket *sock;
    struct sockaddr_storage address;
    int err, fput_needed;
    /* 
    1. 以fd为索引从当前进程的文件描述符表中 
    2. 找到对应的file实例,然后从file实例的private_data中 
    3. 获取socket实例
    */  
    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (sock) 
    {
        /* 
        将用户空间的地址拷贝到内核空间的缓冲区中。 
        */ 
        err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);
        if (err >= 0) 
        {
            /* 
            SELinux相关 
            */
            err = security_socket_bind(sock,
                           (struct sockaddr *)&address,
                           addrlen);
            /* 
            如果是TCP套接字,sock->ops指向的是inet_stream_ops, sock->ops是在inet_create()函数中初始化,所以bind接口调用的是inet_bind()函数。 
            继续跟进inet_bind()
            \linux-2.6.32.63\net\ipv4\af_inet.c
            */  
            if (!err)
                err = sock->ops->bind(sock, (struct sockaddr *)&address, addrlen);
        }
        fput_light(sock->file, fput_needed);
    }
    return err;
}


/*
继续跟进inet_bind()
\linux-2.6.32.63\net\ipv4\af_inet.c
*/
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
    struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
    struct sock *sk = sock->sk;
    struct inet_sock *inet = inet_sk(sk);
    unsigned short snum;
    int chk_addr_ret;
    int err;

    /* 
    If the socket has its own bind function then use it. (RAW)  
    如果是TCP套接字,sk->sk_prot指向的是tcp_prot,在inet_create()中调用的sk_alloc()函数中初始化
    由于tcp_prot中没有设置bind接口,因此判断条件不成立
     */ 
    if (sk->sk_prot->bind) {
        err = sk->sk_prot->bind(sk, uaddr, addr_len);
        goto out;
    }
    err = -EINVAL;
    if (addr_len < sizeof(struct sockaddr_in))
        goto out;

    if (addr->sin_family != AF_INET)
        goto out;
    /* 
    判断传入的地址类型
    */
    chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);

    /* Not specified by any standard per-se, however it breaks too
     * many applications when removed.  It is unfortunate since
     * allowing applications to make a non-local bind solves
     * several problems with systems using dynamic addressing.
     * (ie. your servers still start up even if your ISDN link
     *  is temporarily down)
     */
    err = -EADDRNOTAVAIL;
    /* 
    如果系统不支持绑定本地地址,或者 
    传入的地址类型有误,则返回EADDRNOTAVAIL错误 
    */  
    if (!sysctl_ip_nonlocal_bind &&
        !(inet->freebind || inet->transparent) &&
        addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
        chk_addr_ret != RTN_LOCAL &&
        chk_addr_ret != RTN_MULTICAST &&
        chk_addr_ret != RTN_BROADCAST)
        goto out;

    snum = ntohs(addr->sin_port);
    err = -EACCES;
    /* 
    如果绑定的端口号小于1024(保留端口号),但是当前用户没有CAP_NET_BIND_SERVICE权限,则返回EACCESS错误
    */  
    if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
        goto out;

    /*      We keep a pair of addresses. rcv_saddr is the one
     *      used by hash lookups, and saddr is used for transmit.
     *
     *      In the BSD API these are the same except where it
     *      would be illegal to use them (multicast/broadcast) in
     *      which case the sending device address is used.
     */
    lock_sock(sk);

    /* 
    如果套接字状态不是TCP_CLOSE(套接字的初始状态,参见sock_init_data()函数)
    或者已经绑定过,则返回EINVAL错误 
     */ 
    err = -EINVAL;
    if (sk->sk_state != TCP_CLOSE || inet->num)
        goto out_release_sock;

    inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
    if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
        inet->saddr = 0;  /* Use device */

    /* 
    这里实际调用的是inet_csk_get_port()函数。 
    1. 检查要绑定的端口号是否已经使用,如果已经使用, 
    2. 则检查是否允许复用。如果检查失败,则返回EADDRINUSE错误
    */  
    if (sk->sk_prot->get_port(sk, snum)) {
        inet->saddr = inet->rcv_saddr = 0;
        err = -EADDRINUSE;
        goto out_release_sock;
    }
    /* 
    rcv_saddr存储的是已绑定的本地地址,接收数据时使用。 
    如果已绑定的地址不为0,则设置SOCK_BINDADDR_LOCK标志,表示已绑定本地地址
     */ 
    if (inet->rcv_saddr)
        sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
    /* 
    如果绑定的端口号不为0,则设置SOCK_BINDPORT_LOCK标志,表示已绑定本地端口号。 
    */  
    if (snum)
        sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
    inet->sport = htons(inet->num);
    inet->daddr = 0;
    inet->dport = 0;
    /* 
    重新初始化目的路由缓存项,如果之前已设置,则调用dst_release()释放老的路由缓存项
    */ 
    sk_dst_reset(sk);
    err = 0;
out_release_sock:
    release_sock(sk);
out:
    return err;
}

Relevant Link:

http://blog.csdn.net/justlinux2010/article/details/8593539

 

Copyright (c) 2014 LittleHann All rights reserved

 

TCP Socket Establish;UDP Send Package Process In Kernel Sourcecode Learning,布布扣,bubuko.com

TCP Socket Establish;UDP Send Package Process In Kernel Sourcecode Learning

原文:http://www.cnblogs.com/LittleHann/p/3875451.html

(0)
(0)
   
举报
评论 一句话评论(0
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!