最近,笔者在使用 Python 2.7 的 urllib2 模块进行 HTTP 相关操作时,遇到请求超时的问题。在通过 urllib2.urlopen() 发送 Request 对象时,可以通过参数 timeout
指定超时时间(单位为秒):
# The timeout (8 here) is passed to urlopen(), not to the Request constructor.
request = urllib2.Request("http://www.baidu.com")
response = urllib2.urlopen(request, timeout=8)
然而,这里指定的超时时间到底是指什么?是约定时间内没有完成 IO 操作?还是约定时间内无 IO 事件产生?
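在 Linux 环境中,对于一个 Berkeley 套接字,设置超时的方式有三种:
1. 调用 alarm(),在指定的超时时间到期后产生 SIGALRM 信号;
2. 使用 select()/poll() 等 IO 多路复用函数阻塞等待 IO 事件,这类函数自带超时参数;
3. 使用 SO_RCVTIMEO 和 SO_SNDTIMEO 套接字选项。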
在上文中提到,可以通过 urllib2.urlopen 函数来设置一个 HTTP 请求对象的超时时间。而在 urllib2 模块中,一个请求对象如何与 socket 关联起来呢?笔者在 socket 模块的 create_connection 函数中加入堆栈打印,输出如下:
File "/usr/lib/python2.7/urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 429, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 447, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 407, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1228, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1195, in do_open
h.request(req.get_method(), req.get_selector(), req.data, headers)
File "/usr/lib/python2.7/httplib.py", line 1057, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python2.7/httplib.py", line 1097, in _send_request
self.endheaders(body)
File "/usr/lib/python2.7/httplib.py", line 1053, in endheaders
self._send_output(message_body)
File "/usr/lib/python2.7/httplib.py", line 897, in _send_output
self.send(msg)
File "/usr/lib/python2.7/httplib.py", line 859, in send
self.connect()
File "/usr/lib/python2.7/httplib.py", line 836, in connect
self.timeout, self.source_address)
File "/usr/lib/python2.7/socket.py", line 557, in create_connection
traceback.print_stack()
可见,urllib2.urlopen() 最终会调用 socket.create_connection() 来创建 socket 并建立 TCP 连接:
def create_connection(address, timeout=_GLOBAL_DEFAULT_TIMEOUT,
                      source_address=None):
    """Connect to *address* and return the connected socket object.

    *address* is a ``(host, port)`` pair.  Every address returned by
    ``getaddrinfo()`` for it is tried in order until one connects.

    *timeout*, when given, is applied to the socket with ``settimeout()``
    before connecting; when left at the sentinel default the socket's
    timeout is not touched.  *source_address*, when truthy, is bound to the
    socket before connecting.

    Raises the ``error`` from the last failed attempt if every address
    fails, or ``error("getaddrinfo returns an empty list")`` if there was
    nothing to try.
    """
    host, port = address
    err = None
    for res in getaddrinfo(host, port, 0, SOCK_STREAM):
        af, socktype, proto, canonname, sa = res
        sock = None
        try:
            sock = socket(af, socktype, proto)
            # The sentinel (rather than None) marks "no timeout argument
            # given", because None is itself a valid timeout value.
            if timeout is not _GLOBAL_DEFAULT_TIMEOUT:
                sock.settimeout(timeout)
            if source_address:
                sock.bind(source_address)
            sock.connect(sa)
            return sock
        except error as _:
            # Remember the failure and release the half-built socket, then
            # fall through to the next candidate address.
            err = _
            if sock is not None:
                sock.close()
    if err is not None:
        raise err
    else:
        raise error("getaddrinfo returns an empty list")
在上述代码中,程序首先创建 socket 对象,紧接着根据实参来设置 socket 的超时。socket.settimeout 函数的底层实现如下(仅列出 Linux 下的实现):
/* s.settimeout(value): record the timeout on the socket object and switch
 * the descriptor between blocking and non-blocking mode to match.
 *
 * arg is either None (no timeout, stored as -1.0) or a number converted with
 * PyFloat_AsDouble().  A negative value is rejected.
 *
 * Returns a new reference to None on success, or NULL with an exception set. */
static PyObject *
sock_settimeout(PySocketSockObject *s, PyObject *arg)
{
double timeout;
if (arg == Py_None)
timeout = -1.0;
else {
timeout = PyFloat_AsDouble(arg);
/* A negative result is an error.  ValueError is only set when no
 * exception is pending already (e.g. one left by the conversion). */
if (timeout < 0.0) {
if (!PyErr_Occurred())
PyErr_SetString(PyExc_ValueError,
"Timeout value out of range");
return NULL;
}
}
s->sock_timeout = timeout;
/* Blocking mode only when there is no timeout (negative value); any
 * timeout >= 0.0 puts the descriptor into non-blocking mode. */
internal_setblocking(s, timeout < 0.0);
Py_INCREF(Py_None);
return Py_None;
}
/* Switch the socket's descriptor between blocking (block != 0) and
 * non-blocking (block == 0) mode.
 *
 * Only the BeOS branch and the generic fcntl() branch are reproduced here;
 * the other platform-specific branches are elided.  Always returns 1. */
static int
internal_setblocking(PySocketSockObject *s, int block)
{
#ifndef RISCOS
#ifndef MS_WINDOWS
int delay_flag;
#endif
#endif
Py_BEGIN_ALLOW_THREADS
#ifdef __BEOS__
/* SO_NONBLOCK takes the opposite sense of `block`, hence the inversion. */
block = !block;
setsockopt(s->sock_fd, SOL_SOCKET, SO_NONBLOCK,
(void *)(&block), sizeof(int));
#else
#ifndef RISCOS
#ifndef MS_WINDOWS
#if defined(PYOS_OS2) && !defined(PYCC_GCC)
// omitted
#elif defined(__VMS)
// omitted
#else /* !PYOS_OS2 && !__VMS */
/* Read the current file status flags, clear or set O_NONBLOCK, and
 * write the flags back. */
delay_flag = fcntl(s->sock_fd, F_GETFL, 0);
if (block)
delay_flag &= (~O_NONBLOCK);
else
delay_flag |= O_NONBLOCK;
fcntl(s->sock_fd, F_SETFL, delay_flag);
#endif /* !PYOS_OS2 */
#else /* MS_WINDOWS */
// omitted
#endif /* MS_WINDOWS */
#else /* RISCOS */
// omitted
#endif /* RISCOS */
#endif /* __BEOS__ */
Py_END_ALLOW_THREADS
/* Since these don't return anything */
return 1;
}
可见,urllib2 模块创建一个具有超时时间的请求对象时,在 Linux 下,其底层会把超时时间保存到 s->sock_timeout 中,并通过 fcntl() 将相应 socket 设置为非阻塞模式(O_NONBLOCK)。
完成超时设置后,程序会调用socket.connect()来建立TCP连接,该函数的底层实现如下:
/* s.connect(address): parse the Python-level address, perform the connect
 * via internal_connect(), and translate its outcome for the caller.
 *
 * Returns a new reference to None on success.  Returns NULL with the
 * socket_timeout exception ("timed out") set when the wait timed out, and
 * otherwise whatever s->errorhandler() returns for a non-zero result. */
static PyObject *
sock_connect(PySocketSockObject *s, PyObject *addro)
{
sock_addr_t addrbuf;
int addrlen;
int res;
int timeout;
if (!getsockaddrarg(s, addro, SAS2SA(&addrbuf), &addrlen))
return NULL;
/* The potentially long connect runs between the ALLOW_THREADS macros;
 * `timeout` is always written by internal_connect() through its last
 * argument. */
Py_BEGIN_ALLOW_THREADS
res = internal_connect(s, SAS2SA(&addrbuf), addrlen, &timeout);
Py_END_ALLOW_THREADS
/* The timeout flag is checked before the generic error code. */
if (timeout == 1) {
PyErr_SetString(socket_timeout, "timed out");
return NULL;
}
if (res != 0)
return s->errorhandler();
Py_INCREF(Py_None);
return Py_None;
}
其中核心函数internal_connect的源码如下(仅列出Linux下的实现):
/* Call connect() on the socket, waiting for completion when the socket has
 * a timeout (only the non-Windows implementation is shown).
 *
 * Returns 0 on success, otherwise an errno-style error code.
 * *timeoutp receives the result of the wait: 1 if it timed out, -1 if the
 * wait itself failed, 0 otherwise (including when no wait was needed). */
static int
internal_connect(PySocketSockObject *s, struct sockaddr *addr, int addrlen,
int *timeoutp)
{
int res, timeout;
timeout = 0;
res = connect(s->sock_fd, addr, addrlen);
#ifdef MS_WINDOWS
// omitted
#else
if (s->sock_timeout > 0.0) {
/* connect() reported EINPROGRESS: the connection is still being
 * established, so wait (bounded by the socket timeout) until the
 * socket becomes writable. */
if (res < 0 && errno == EINPROGRESS && IS_SELECTABLE(s)) {
timeout = internal_select(s, 1);
if (timeout == 0) {
/* Bug #1019808: in case of an EINPROGRESS,
use getsockopt(SO_ERROR) to get the real
error. */
socklen_t res_size = sizeof res;
(void)getsockopt(s->sock_fd, SOL_SOCKET,
SO_ERROR, &res, &res_size);
/* "Already connected" counts as success. */
if (res == EISCONN)
res = 0;
errno = res;
}
else if (timeout == -1) {
res = errno; /* had error */
}
else
res = EWOULDBLOCK; /* timed out */
}
}
/* connect() failed and was not handled above: report its errno. */
if (res < 0)
res = errno;
#endif
*timeoutp = timeout;
return res;
}
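internal_connect 函数的工作流程可以概括为:
1. 首先调用 connect() 发起连接;
2. 若设置了超时时间(s->sock_timeout > 0.0),则说明该 socket 为非阻塞模式,那么接下来的工作其实就是非阻塞 connect 的流程:若 connect() 返回 -1 且 errno 为 EINPROGRESS,则调用 internal_select() 等待 socket 可写;若在超时时间内变为可写,再通过 getsockopt(SO_ERROR) 获取连接的真实结果;若等待超时,则返回 EWOULDBLOCK,并通过 timeoutp 告知调用者发生了超时。
internal_select() 内部正是通过 poll()/select() 来实现带超时的等待。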
不信,见 internal_select 函数的实现:
/* Wait until the socket is readable (writing == 0) or writable
 * (writing != 0), for at most s->sock_timeout seconds.
 *
 * Returns 0 when the socket is ready or no waiting applies (no positive
 * timeout, or a closed descriptor), 1 when the wait timed out, and -1 when
 * poll()/select() itself failed. */
static int
internal_select(PySocketSockObject *s, int writing)
{
int n;
/* Nothing to do unless we're in timeout mode (not non-blocking) */
if (s->sock_timeout <= 0.0)
return 0;
/* Guard against closed socket */
if (s->sock_fd < 0)
return 0;
/* Prefer poll, if available, since you can poll() any fd
* which can't be done with select(). */
#ifdef HAVE_POLL
{
struct pollfd pollfd;
int timeout;
pollfd.fd = s->sock_fd;
pollfd.events = writing ? POLLOUT : POLLIN;
/* s->sock_timeout is in seconds, timeout in ms */
/* Adding 0.5 before the cast rounds to the nearest millisecond. */
timeout = (int)(s->sock_timeout * 1000 + 0.5);
n = poll(&pollfd, 1, timeout);
}
#else
{
/* Construct the arguments to select */
fd_set fds;
struct timeval tv;
/* Split the timeout into whole seconds and leftover microseconds. */
tv.tv_sec = (int)s->sock_timeout;
tv.tv_usec = (int)((s->sock_timeout - tv.tv_sec) * 1e6);
FD_ZERO(&fds);
FD_SET(s->sock_fd, &fds);
/* See if the socket is ready */
if (writing)
n = select(s->sock_fd+1, NULL, &fds, NULL, &tv);
else
n = select(s->sock_fd+1, &fds, NULL, NULL, &tv);
}
#endif
/* n < 0: the call failed; n == 0: nothing became ready in time. */
if (n < 0)
return -1;
if (n == 0)
return 1;
return 0;
}
从上述代码中可见,若 poll()/select() 等待可写事件超时,internal_select() 返回 1,该值经 internal_connect() 的 timeoutp 参数传回 sock_connect(),后者随即调用 PyErr_SetString() 来设置异常对象,以提示 Python 层 connect() 超时:
/* Excerpt from sock_connect(): a timeout flag of 1 means the wait for the
 * socket timed out, so the timeout exception is set for the caller. */
if (timeout == 1) {
PyErr_SetString(socket_timeout, "timed out");
return NULL;
}
当 TCP 连接成功建立之后,接下来需要处理应用层上的网络数据交互。对应到 urllib2 模块,就是如 HTTP 请求与响应等操作。而这些操作,都是通过调用底层的 sock_recv()、sock_send() 等来实现的。
以 sock_recv 函数为例:
/* s.recv(nbytes [, flags]): receive up to nbytes bytes from the socket.
 *
 * Parses the arguments, allocates a string object of the requested size,
 * fills it through sock_recv_guts(), and shrinks it to the number of bytes
 * actually received.
 *
 * Returns the new string, or NULL with an exception set (bad arguments,
 * negative size, allocation failure, or a failure reported by
 * sock_recv_guts()). */
static PyObject *
sock_recv(PySocketSockObject *s, PyObject *args)
{
int recvlen, flags = 0;
ssize_t outlen;
PyObject *buf;
if (!PyArg_ParseTuple(args, "i|i:recv", &recvlen, &flags))
return NULL;
if (recvlen < 0) {
PyErr_SetString(PyExc_ValueError,
"negative buffersize in recv");
return NULL;
}
/* Allocate a new string. */
buf = PyString_FromStringAndSize((char *) 0, recvlen);
if (buf == NULL)
return NULL;
/* Call the guts */
outlen = sock_recv_guts(s, PyString_AS_STRING(buf), recvlen, flags);
if (outlen < 0) {
/* An error occurred, release the string and return an
error. */
Py_DECREF(buf);
return NULL;
}
if (outlen != recvlen) {
/* We did not read as many bytes as we anticipated, resize the
string if possible and be successful. */
if (_PyString_Resize(&buf, outlen) < 0)
/* Oopsy, not so successful after all. */
return NULL;
}
return buf;
}
其核心函数 sock_recv_guts() 的实现如下:
/* Core of the recv operations: wait for the socket to become readable
 * (honouring the socket timeout through internal_select()) and then read
 * into cbuf.
 *
 * Returns the number of bytes stored in cbuf, or -1 on failure.  On a
 * timeout the socket_timeout exception ("timed out") is set; on other
 * failures s->errorhandler() is called.
 *
 * The non-VMS branch issues a single recv(); the VMS branch reads in pieces
 * of at most SEGMENT_SIZE bytes. */
static ssize_t
sock_recv_guts(PySocketSockObject *s, char* cbuf, int len, int flags)
{
ssize_t outlen = -1;
int timeout;
#ifdef __VMS
int remaining;
char *read_buf;
#endif
/* NOTE(review): select_error() presumably sets the exception for a
 * descriptor that cannot be used with select() - confirm in its definition. */
if (!IS_SELECTABLE(s)) {
select_error();
return -1;
}
#ifndef __VMS
Py_BEGIN_ALLOW_THREADS
/* recv() is only attempted when the wait reported "ready" (0); otherwise
 * outlen keeps its initial value of -1. */
timeout = internal_select(s, 0);
if (!timeout)
outlen = recv(s->sock_fd, cbuf, len, flags);
Py_END_ALLOW_THREADS
if (timeout == 1) {
PyErr_SetString(socket_timeout, "timed out");
return -1;
}
if (outlen < 0) {
/* Note: the call to errorhandler() ALWAYS indirectly returned
NULL, so ignore its return value */
s->errorhandler();
return -1;
}
#else
read_buf = cbuf;
remaining = len;
while (remaining != 0) {
unsigned int segment;
int nread = -1;
/* Request SEGMENT_SIZE bytes while at least that many remain,
 * otherwise just the remainder. */
segment = remaining /SEGMENT_SIZE;
if (segment != 0) {
segment = SEGMENT_SIZE;
}
else {
segment = remaining;
}
Py_BEGIN_ALLOW_THREADS
timeout = internal_select(s, 0);
if (!timeout)
nread = recv(s->sock_fd, read_buf, segment, flags);
Py_END_ALLOW_THREADS
if (timeout == 1) {
PyErr_SetString(socket_timeout, "timed out");
return -1;
}
if (nread < 0) {
s->errorhandler();
return -1;
}
/* Stop after accounting for the bytes just read when nread differs
 * from the amount still outstanding. */
if (nread != remaining) {
read_buf += nread;
break;
}
remaining -= segment;
read_buf += segment;
}
/* Total received = how far the write pointer advanced. */
outlen = read_buf - cbuf;
#endif /* !__VMS */
return outlen;
}
可见,程序首先调用 internal_select() 来检测可读事件。若有可读事件发生,则调用 recv() 进行数据读取(当然未必能读取完整);若超时,则同样调用 PyErr_SetString() 来设置超时的异常对象。
当使用 urllib2 模块创建一个具有超时时间的 HTTP 请求时,其实质是创建一个非阻塞 socket;当对这个 HTTP 请求进行 IO 操作时(如建立 TCP 连接、收发数据),若发生了超时,则表明在超时时间内,没有产生相应的 IO 事件,而非“在超时时间内没有完成 IO 操作”。换言之,timeout 限制的是每一次等待 IO 事件的时长,而不是整个请求的总耗时;只要对端持续而缓慢地返回数据,整个请求的耗时就可能远超 timeout。
原文:https://www.cnblogs.com/crezov/p/python-urllib2-timeout-implementation.html