Python urllib2 模块请求超时的底层实现

时间：2021-07-22 11:10:24 阅读：27 评论：0 收藏：0 [点我收藏+]

前言

最近，笔者在使用 Python 2.7 的 urllib2 模块进行 HTTP 相关操作时，遇到了请求超时的问题。在调用 urllib2.urlopen() 打开一个 Request 对象时，可以通过参数 timeout 指定超时时间（单位为秒）：

# Build the request object; no timeout is given at this point.
request = urllib2.Request("http://www.baidu.com")
# The timeout is an argument of urlopen(), not of Request().
response = urllib2.urlopen(request, timeout=8)

然而，这里指定的超时时间到底是指什么？是约定时间内没有完成 IO 操作？还是约定时间内无 IO 事件产生？

socket 如何设置超时

在 Linux 环境中，对于一个 Berkeley 套接字，设置超时的方式有三种：

通过 alarm 函数设置超时，当超时时产生 SIGALRM 信号；
通过 setsockopt 函数和套接字选项 SO_RCVTIMEO、SO_SNDTIMEO，来设置 recv()/send() 的超时时间；
通过非阻塞 socket 与 IO 多路复用。

urllib2 模块如何实现超时

connect 超时

在上文中提到，可以通过 urllib2.urlopen 函数来设置一个 HTTP 请求对象的超时时间。而在 urllib2 模块中，一个请求对象如何与 socket 关联起来呢？笔者在 socket 模块的 create_connection 函数中加入堆栈打印，输出如下：

File "/usr/lib/python2.7/urllib2.py", line 154, in urlopen
    return opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 429, in open
    response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 447, in _open
    '_open', req)
File "/usr/lib/python2.7/urllib2.py", line 407, in _call_chain
    result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1228, in http_open
    return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1195, in do_open
    h.request(req.get_method(), req.get_selector(), req.data, headers)
File "/usr/lib/python2.7/httplib.py", line 1057, in request
    self._send_request(method, url, body, headers)
File "/usr/lib/python2.7/httplib.py", line 1097, in _send_request
    self.endheaders(body)
File "/usr/lib/python2.7/httplib.py", line 1053, in endheaders
    self._send_output(message_body)
File "/usr/lib/python2.7/httplib.py", line 897, in _send_output
    self.send(msg)
File "/usr/lib/python2.7/httplib.py", line 859, in send
    self.connect()
File "/usr/lib/python2.7/httplib.py", line 836, in connect
    self.timeout, self.source_address)
File "/usr/lib/python2.7/socket.py", line 557, in create_connection
    traceback.print_stack()

可见，urllib2.urlopen() 最终会调用 socket.create_connection() 来创建 socket 并建立 TCP 连接：

def create_connection(address, timeout=_GLOBAL_DEFAULT_TIMEOUT,
                      source_address=None):
    """Connect to *address* and return the connected socket.

    *address* is a ``(host, port)`` pair.  Every entry returned by
    ``getaddrinfo(host, port, 0, SOCK_STREAM)`` is tried in order and the
    first socket whose ``connect()`` succeeds is returned.

    If *timeout* is anything other than the ``_GLOBAL_DEFAULT_TIMEOUT``
    sentinel it is applied with ``settimeout()`` before connecting.  A truthy
    *source_address* is passed to ``bind()`` before connecting.

    Raises the ``error`` caught for the last candidate that failed, or
    ``error("getaddrinfo returns an empty list")`` when there was no
    candidate to try.
    """
    host, port = address
    last_error = None

    for family, kind, protocol, _canonname, sockaddr in getaddrinfo(
            host, port, 0, SOCK_STREAM):
        candidate = None
        try:
            candidate = socket(family, kind, protocol)
            if timeout is not _GLOBAL_DEFAULT_TIMEOUT:
                candidate.settimeout(timeout)
            if source_address:
                candidate.bind(source_address)
            candidate.connect(sockaddr)
            return candidate
        except error as exc:
            # Remember the failure and release the half-set-up socket before
            # moving on to the next candidate address.
            last_error = exc
            if candidate is not None:
                candidate.close()

    if last_error is None:
        raise error("getaddrinfo returns an empty list")
    raise last_error

在上述代码中，程序首先创建 socket 对象，紧接着根据实参来设置 socket 的超时。socket.settimeout 函数的底层实现如下（仅列出 Linux 下的实现）：

/* Set the timeout of socket `s` from the Python object `arg`.
 *
 * None is stored as -1.0.  Any other object is converted with
 * PyFloat_AsDouble(); a negative result is rejected.  The accepted value is
 * written to s->sock_timeout and internal_setblocking() is called with
 * "block" true only for the negative (None) case.
 *
 * Returns None on success, or NULL with an exception set: the exception
 * raised by the conversion if there is one, otherwise ValueError.
 */
static PyObject *
sock_settimeout(PySocketSockObject *s, PyObject *arg)
{
    double seconds = -1.0;      /* value used when arg is None */

    if (arg != Py_None) {
        seconds = PyFloat_AsDouble(arg);
        if (seconds < 0.0) {
            /* Do not overwrite an error already raised by the conversion. */
            if (PyErr_Occurred() == NULL)
                PyErr_SetString(PyExc_ValueError,
                                "Timeout value out of range");
            return NULL;
        }
    }

    s->sock_timeout = seconds;
    /* Only a negative timeout keeps the descriptor in blocking mode. */
    internal_setblocking(s, seconds < 0.0);

    Py_INCREF(Py_None);
    return Py_None;
}


/* Switch the descriptor of socket `s` between blocking and non-blocking mode.
 *
 * s:     socket object whose sock_fd is reconfigured.
 * block: non-zero selects blocking mode, zero selects non-blocking mode.
 *
 * Always returns 1; the results of the underlying system calls are not
 * checked.  Only the BeOS branch and the generic fcntl() branch are shown
 * here; the bodies of the other platform branches are omitted.
 */
static int
internal_setblocking(PySocketSockObject *s, int block)
{
#ifndef RISCOS
#ifndef MS_WINDOWS
    int delay_flag;
#endif
#endif

    /* NOTE(review): Py_BEGIN/END_ALLOW_THREADS are defined outside this
       excerpt; in CPython they bracket code that runs without the GIL. */
    Py_BEGIN_ALLOW_THREADS
#ifdef __BEOS__
    /* SO_NONBLOCK takes the opposite sense of `block`, hence the negation. */
    block = !block;
    setsockopt(s->sock_fd, SOL_SOCKET, SO_NONBLOCK,
               (void *)(&block), sizeof(int));
#else
#ifndef RISCOS
#ifndef MS_WINDOWS
#if defined(PYOS_OS2) && !defined(PYCC_GCC)
	// omitted
#elif defined(__VMS)
    // omitted
#else  /* !PYOS_OS2 && !__VMS */
    /* Read the current file status flags, change only O_NONBLOCK, and
       write the flags back. */
    delay_flag = fcntl(s->sock_fd, F_GETFL, 0);
    if (block)
        delay_flag &= (~O_NONBLOCK);
    else
        delay_flag |= O_NONBLOCK;
    fcntl(s->sock_fd, F_SETFL, delay_flag);
#endif /* !PYOS_OS2 */
#else /* MS_WINDOWS */
    // omitted
#endif /* MS_WINDOWS */
#else /* RISCOS */
   // omitted
#endif /* RISCOS */
#endif /* __BEOS__ */
    Py_END_ALLOW_THREADS

    /* Since these don't return anything */
    return 1;
}

可见，urllib2模块创建一个具有超时时间的请求对象，在Linux下，其底层是通过fcntl来设置相应socket为非阻塞模式来实现的。

完成超时设置后，程序会调用socket.connect()来建立TCP连接，该函数的底层实现如下：

/* Connect socket `s` to the address described by the Python object `addro`.
 *
 * Returns None on success.  Returns NULL when getsockaddrarg() fails or
 * when internal_connect() reports a timeout (socket_timeout "timed out" is
 * set in that case).  For any other non-zero result the return value of
 * s->errorhandler() is returned.
 */
static PyObject *
sock_connect(PySocketSockObject *s, PyObject *addro)
{
    sock_addr_t addrbuf;
    int addrlen;
    int res;
    int timeout;    /* filled in by internal_connect(); 1 means timed out */

    /* Convert the Python address object into a C socket address. */
    if (!getsockaddrarg(s, addro, SAS2SA(&addrbuf), &addrlen))
        return NULL;

    Py_BEGIN_ALLOW_THREADS
    res = internal_connect(s, SAS2SA(&addrbuf), addrlen, &timeout);
    Py_END_ALLOW_THREADS

    /* The timeout check comes first: a timed-out connect also leaves a
       non-zero res, but must be reported as socket_timeout. */
    if (timeout == 1) {
        PyErr_SetString(socket_timeout, "timed out");
        return NULL;
    }
    if (res != 0)
        return s->errorhandler();
    Py_INCREF(Py_None);
    return Py_None;
}

其中核心函数internal_connect的源码如下（仅列出Linux下的实现）：

/* Perform connect() on socket `s`, honouring its timeout if one is set.
 *
 * s:        socket object (sock_fd and sock_timeout are read).
 * addr:     destination address passed straight to connect().
 * addrlen:  length of `addr`.
 * timeoutp: out-parameter; receives 0 when no wait was needed or the wait
 *           ended with an event, 1 when the wait timed out, and -1 when
 *           the wait itself failed (values come from internal_select()).
 *
 * Returns 0 on success, otherwise an errno-style error code.
 * The Windows branch is omitted in this excerpt.
 */
static int
internal_connect(PySocketSockObject *s, struct sockaddr *addr, int addrlen,
                 int *timeoutp)
{
    int res, timeout;

    timeout = 0;
    res = connect(s->sock_fd, addr, addrlen);

#ifdef MS_WINDOWS
	// omitted
#else
    
    /* Only sockets with a positive timeout take the wait-for-writable path. */
    if (s->sock_timeout > 0.0) {
        /* EINPROGRESS: the connect has been started but is not finished. */
        if (res < 0 && errno == EINPROGRESS && IS_SELECTABLE(s)) {
            /* Wait until the socket is writable or the timeout expires. */
            timeout = internal_select(s, 1);
            if (timeout == 0) {
                /* Bug #1019808: in case of an EINPROGRESS,
                   use getsockopt(SO_ERROR) to get the real
                   error. */
                socklen_t res_size = sizeof res;
                (void)getsockopt(s->sock_fd, SOL_SOCKET,
                                 SO_ERROR, &res, &res_size);
                /* "Already connected" is treated as success. */
                if (res == EISCONN)
                    res = 0;
                errno = res;
            }
            else if (timeout == -1) {
                res = errno;            /* had error */
            }
            else
                res = EWOULDBLOCK;                      /* timed out */
        }
    }

    /* A still-negative res is the raw connect() failure; report errno. */
    if (res < 0)
        res = errno;

#endif
    *timeoutp = timeout;

    return res;
}

internal_connect 函数的工作流程可以概括为：

调用 connect() 来尝试建立连接；
connect() 调用结束，如果 socket 没有设置超时时间，则判断 connect 函数的返回值，若返回值小于 0（出错），则返回错误码 errno；
如果 socket 设置了超时时间（s->sock_timeout > 0.0），则说明该 socket 为非阻塞模式，那么接下来的工作其实就是非阻塞 connect 的流程：
- 判断 errno 是否为 EINPROGRESS（TCP 三次握手正在进行中）；
- 若是，通过 IO 多路复用（select、poll）来检测 socket 是否可写；
- 若发生可写事件，再调用 getsockopt() 来检查 socket 是否出错；
- 若无错误发生，则表明该 socket 的非阻塞 connect 已完成，TCP 连接已建立。

不信，见 internal_select 函数的实现：

/* Wait for socket `s` to become ready, for at most s->sock_timeout.
 *
 * writing: non-zero waits for writability, zero waits for readability.
 *
 * Returns 0 when no wait is needed (timeout not positive, or descriptor
 * negative) or when the wait ended with an event, 1 when the wait timed
 * out, and -1 when poll()/select() itself failed.
 */
static int
internal_select(PySocketSockObject *s, int writing)
{
    int ready;

    /* Nothing to wait for unless the socket is in timeout mode and its
       descriptor is still valid. */
    if (s->sock_timeout <= 0.0 || s->sock_fd < 0)
        return 0;

    /* poll() is preferred when available because it accepts any fd,
       which select() does not. */
#ifdef HAVE_POLL
    {
        struct pollfd pfd;
        int timeout_ms;

        pfd.fd = s->sock_fd;
        if (writing)
            pfd.events = POLLOUT;
        else
            pfd.events = POLLIN;

        /* sock_timeout is in seconds; poll() takes milliseconds. */
        timeout_ms = (int)(s->sock_timeout * 1000 + 0.5);
        ready = poll(&pfd, 1, timeout_ms);
    }
#else
    {
        fd_set fds;
        struct timeval tv;
        fd_set *readfds = NULL;
        fd_set *writefds = NULL;

        /* Split the timeout into whole seconds and microseconds. */
        tv.tv_sec = (int)s->sock_timeout;
        tv.tv_usec = (int)((s->sock_timeout - tv.tv_sec) * 1e6);
        FD_ZERO(&fds);
        FD_SET(s->sock_fd, &fds);

        /* The single fd set is passed as either the read or the write set. */
        if (writing)
            writefds = &fds;
        else
            readfds = &fds;
        ready = select(s->sock_fd+1, readfds, writefds, NULL, &tv);
    }
#endif

    if (ready < 0)
        return -1;
    return (ready == 0) ? 1 : 0;
}

从上述代码中可见，若 internal_select()（优先使用 poll()，否则使用 select()）等待可写事件超时，则返回 1；该值经 internal_connect() 的 timeoutp 参数传回 sock_connect()，后者随即调用 PyErr_SetString() 来设置异常对象，以提示 Python 层 connect() 超时：

/* timeout is 1 when internal_select() got 0 back from poll()/select(),
   i.e. no event arrived within the timeout. */
if (timeout == 1) {
	PyErr_SetString(socket_timeout, "timed out");
	return NULL;
}

recv/send 超时

当 TCP 连接成功建立之后，接下来需要处理应用层上的网络数据交互。对应到 urllib2 模块，就是如 HTTP 请求与响应等操作。而这些操作，都是通过调用底层的 sock_recv()、sock_send() 等来实现的。

以 sock_recv 函数为例：

/* Receive up to the requested number of bytes from socket `s`.
 *
 * args is parsed with the format "i|i:recv": a buffer size and optional
 * flags (default 0).  A string of the requested size is allocated, filled
 * by sock_recv_guts(), and shrunk to the number of bytes actually received.
 *
 * Returns the string object, or NULL when argument parsing fails, the size
 * is negative (ValueError), allocation fails, sock_recv_guts() reports an
 * error, or the resize fails.
 */
static PyObject *
sock_recv(PySocketSockObject *s, PyObject *args)
{
    int wanted;
    int flags = 0;
    ssize_t received;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "i|i:recv", &wanted, &flags))
        return NULL;

    if (wanted < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "negative buffersize in recv");
        return NULL;
    }

    /* Allocate the result string up front; its contents are filled in
       by the receive call below. */
    result = PyString_FromStringAndSize(NULL, wanted);
    if (result == NULL)
        return NULL;

    received = sock_recv_guts(s, PyString_AS_STRING(result), wanted, flags);
    if (received < 0) {
        /* The helper reported failure: drop the string and pass the
           error on. */
        Py_DECREF(result);
        return NULL;
    }

    /* Fewer bytes than requested: shrink the string to what was read.
       A failed resize is reported as an error. */
    if (received != wanted && _PyString_Resize(&result, received) < 0)
        return NULL;

    return result;
}

其核心函数 sock_recv_guts() 的实现如下：

/* Wait for socket `s` to become readable, then recv() into `cbuf`.
 *
 * s:     socket object.
 * cbuf:  destination buffer.
 * len:   maximum number of bytes to receive.
 * flags: flags passed through to recv().
 *
 * Returns the number of bytes stored in cbuf, or -1 after an error has been
 * reported: select_error() when the socket is not selectable,
 * socket_timeout "timed out" when the wait times out, s->errorhandler()
 * when the wait or recv() fails.
 */
static ssize_t
sock_recv_guts(PySocketSockObject *s, char* cbuf, int len, int flags)
{
    ssize_t outlen = -1;
    int timeout;
#ifdef __VMS
    int remaining;
    char *read_buf;
#endif

    if (!IS_SELECTABLE(s)) {
        select_error();
        return -1;
    }

#ifndef __VMS
    Py_BEGIN_ALLOW_THREADS
    /* internal_select() returns 0 when the socket is ready (or no wait is
       needed), 1 on timeout and -1 on failure; recv() is only attempted
       in the first case, otherwise outlen keeps its initial -1. */
    timeout = internal_select(s, 0);
    if (!timeout)
        outlen = recv(s->sock_fd, cbuf, len, flags);
    Py_END_ALLOW_THREADS

    if (timeout == 1) {
        PyErr_SetString(socket_timeout, "timed out");
        return -1;
    }
    if (outlen < 0) {
        /* Note: the call to errorhandler() ALWAYS indirectly returned
           NULL, so ignore its return value */
        s->errorhandler();
        return -1;
    }
#else
    /* VMS: receive in pieces of at most SEGMENT_SIZE bytes. */
    read_buf = cbuf;
    remaining = len;
    while (remaining != 0) {
        unsigned int segment;
        int nread = -1;

        /* Use a full SEGMENT_SIZE piece while at least that much is still
           wanted, otherwise ask for whatever remains. */
        segment = remaining /SEGMENT_SIZE;
        if (segment != 0) {
            segment = SEGMENT_SIZE;
        }
        else {
            segment = remaining;
        }

        Py_BEGIN_ALLOW_THREADS
        timeout = internal_select(s, 0);
        if (!timeout)
            nread = recv(s->sock_fd, read_buf, segment, flags);
        Py_END_ALLOW_THREADS

        if (timeout == 1) {
            PyErr_SetString(socket_timeout, "timed out");
            return -1;
        }
        if (nread < 0) {
            s->errorhandler();
            return -1;
        }
        /* Stop after this piece unless recv() returned exactly the number
           of bytes still outstanding. */
        if (nread != remaining) {
            read_buf += nread;
            break;
        }

        remaining -= segment;
        read_buf += segment;
    }
    /* Total received = how far the write pointer advanced. */
    outlen = read_buf - cbuf;
#endif /* !__VMS */

    return outlen;
}

可见，程序首先调用 internal_select() 来检测可读事件。若有可读事件发生，则调用 recv() 进行数据读取（当然未必能读取完整）；若超时，则同样调用 PyErr_SetString() 来设置超时的异常对象。

总结

当使用 urllib2 模块创建一个具有超时时间的 HTTP 请求时，其实质是创建一个非阻塞 socket；当对这个 HTTP 请求进行 IO 操作时（如建立 TCP 连接、收发数据），若发生了超时，则表明在超时时间内，没有产生相应的 IO 事件，而非“在超时时间内没有完成 IO 操作”。

Python urllib2 模块请求超时的底层实现

原文：https://www.cnblogs.com/crezov/p/python-urllib2-timeout-implementation.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)