上篇分析的是wget参数解析,本篇主要是分析wget非递归下载html或者文件。wget实际上就是通过sock 向web服务器发送http数据包(GET or POST),web服务器收到请求后,发回回复包给wget,当然了http 传输层是tcp协议,简单来说wget 发tcp包,发送内容符合http协议,web服务器(such as nginx、apache)解析请求包,针对请求返回回复包。so easy.
整个wget可以用下面的流程图概括
其中config_analysis已经在前一篇已经分析过了,本篇就是分析wget后面的实现。
我们仅仅是不加任何参数的非递归下载,也就是执行如下命令:
wget www.baidu.com/index.html
之前分析的是参数解析篇,参数解析完之后,会对参数进行校验。此段代码略过
nurl = argc - optind //参数解析完后,获取用户下载url个数,因为执行的命令是
wget www.baidu.com/index.html所以 nurl==1
//code1: url = alloca_array (char *, nurl + 1); for (i = 0; i < nurl; i++, optind++) { char *rewritten = rewrite_shorthand_url (argv[optind]);//为url增加http:// if (rewritten) url[i] = rewritten; else url[i] = xstrdup (argv[optind]); }
url[i] = NULL;//设置url最后元素为空,作为标记变量
上面那块代码主要是为url分配内存,分配nurl+1个元素的char*数组。
函数rewrite_shorthand_url (argv[optind]) 主要是为url添加http://字段,支持用户不输入协议,wget支持http、https、ftp协议,如果用户没输入协议,默认http://。并且url最后一个元素置为NULL,作为标志位。
//code2部分代码: for (t = url; *t; t++) { char *filename = NULL, *redirected_URL = NULL; int dt, url_err; /* Need to do a new struct iri every time, because * retrieve_url may modify it in some circumstances, * currently. */ struct iri *iri = iri_new (); struct url *url_parsed; set_uri_encoding (iri, opt.locale, true); /*对url进行解析*/ url_parsed = url_parse (*t, &url_err, iri, true); if (!url_parsed) { char *error = url_error (*t, url_err); logprintf (LOG_NOTQUIET, "%s: %s.\n",*t, error); xfree (error); inform_exit_status (URLERROR); } else { /*如果是递归or需要页面css js之类的,并且不是ftp协议*/ if ((opt.recursive || opt.page_requisites) && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (url_parsed))) /*此case为递归url下载*/ { int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ if (url_scheme (*t) == SCHEME_FTP) opt.follow_ftp = 1; retrieve_tree (url_parsed, NULL); opt.follow_ftp = old_follow_ftp; } else { /*此处为非递归url下载*/ retrieve_url (url_parsed, *t, &filename, &redirected_URL, NULL, &dt, opt.recursive, iri, true); }
代码遍历url,对用户的每一个url都进行下载。
函数url_parse主要是对url进行解析(而set_uri_encoding是设置url的编码方式),一个正常的url是如下格式:
scheme://host[:port][/path][;params][?query][#fragment],此函数就是对url解析出来每一个结构。如下:
url : http://www.baidu.com/index.html scheme: SCHEME_HTTP host: www.baidu.com port: 80 path: index.html params: NULL query: NULL fragment: NULL file: index.html user: NULL passwd: NULL
同时会对url进行utf-8编码。
会根据用户参数来决定是递归下载or非递归下载
递归下载条件:(用户输入-r or -p) && (not ftp协议 or use_proxy)
因为我们是直接下载,所以会跳到
retrieve_url (url_parsed, *t,&filename, &redirected_URL, NULL,
&dt, opt.recursive, iri, true);
//如果使用proxy会设置一些属性,因为没有用proxy所以跳过了。
result = http_loop (u, orig_parsed,&mynewloc, &local_file, refurl, dt, proxy_url, iri);
参数说明:
u和orig_parsed的属性是相同值
mynewloc 指向NULL。
local_file 指向NULL。
refurl指向NULL。
dt 为 -1。
proxy_url 指向NULL。
iri为上层分析的那个iri,包括编码方式。
//code hstat.referer = referer;//设置referer,此时的referer为NULL //保存文件名称 首先是通过 --output-document 如果没有就获取url后缀名称 if (opt.output_document) { hstat.local_file = xstrdup (opt.output_document); got_name = true; } else if (!opt.content_disposition) { hstat.local_file = url_file_name (opt.trustservernames ? u : original_url, NULL); /*此函数主要是如果u->file如果存在,会生成一个新的文件名file_1…如果是设置 了clobber就会覆盖*/ got_name = true; }
req = request_new ();//构造一个req头 static struct request * request_new (void) { struct request *req = xnew0 (struct request);//分配request结构 req->hcapacity = 8;//初始化http头部数组为8个 req->headers = xnew_array (struct request_header, req->hcapacity);//分配 return req; }
下面是请求结构
struct request {
  const char *method;   /* HTTP method, e.g. "GET" or "POST" */
  char *arg;            /* request argument (the URL/path being requested) */
  /* One HTTP header stored as a key/value pair, e.g. for
     "Content-Length: xxxx" the key is "Content-Length" and the value
     is "xxxx".  release_policy records which of the two strings the
     request owns and must free.  */
  struct request_header {
    char *name, *value;
    enum rp release_policy;
  } *headers;           /* dynamically grown array of headers */
  int hcount;           /* number of headers currently stored */
  int hcapacity;        /* allocated capacity of the headers array */
};
设置http方法
request_set_method(req, meth, meth_arg) { req->method = meth; req->arg = meth_arg; }
设置http header
static void request_set_header (struct request *req, char *name, char *value, enum rp release_policy) { struct request_header *hdr; int i; if (!value) { /* A NULL value is a no-op; if freeing the name is requested, free it now to avoid leaks. */ if (release_policy == rel_name || release_policy == rel_both) xfree (name); return; } //首先是遍历所有头部,如果说找到的话,就释放设置成新的头 for (i = 0; i < req->hcount; i++) { hdr = &req->headers[i]; if (0 == strcasecmp (name, hdr->name)) { /* Replace existing header. */ release_header (hdr); hdr->name = name; hdr->value = value; hdr->release_policy = release_policy; return; } } //如果用户设置的头很多,超过了8个就重新分配 2的幂增长 if (req->hcount >= req->hcapacity) { req->hcapacity <<= 1; req->headers = xrealloc (req->headers, req->hcapacity * sizeof (*hdr)); } hdr = &req->headers[req->hcount++]; hdr->name = name; hdr->value = value; hdr->release_policy = release_policy; }
后面设置头都调用request_set_header这个函数
连接服务器:
Sock = connect_to_host (conn->host,conn->port)
如果是host为ip地址,那么就直接连接,如果不是,首先查找dns cache
Dns_cache是一个hash table(如果给出的是host,就需要获得host的ip)
此hash 是通过算法把host字符串算成一个int 的key,然后再求余算索引,hash处理冲突采用开放定址法。
创建key算法(key为host,算出来的结果为hash的key)
/* Case-insensitive hash of the NUL-terminated string KEY, used to
   index the DNS cache by host name.  It is a multiplicative hash:
   (h << 5) - h is h * 31, folded over the lowercased characters, so
   "Host" and "host" hash identically.
   Fix: the original excerpt used mis-encoded curly quotes (‘\0‘)
   around the NUL character, which does not compile; replaced with the
   proper character literal '\0'.  */
static unsigned long
hash_string_nocase (const void *key)
{
  const char *p = key;
  /* c_tolower is the locale-independent ASCII lowercasing helper. */
  unsigned int h = c_tolower (*p);
  if (h)
    for (p += 1; *p != '\0'; p++)
      h = (h << 5) - h + c_tolower (*p);
  return h;
}
查找hash key 算法是开放定址法,这里就不说了。
Hash表的value为struct address_list *al
struct address_list {
  int count;              /* number of addresses */
  ip_address *addresses;  /* pointer to the string of addresses */
  int faulty;             /* number of addresses known not to work. */
  bool connected;         /* whether we were able to connect to one of
                             the addresses in the list, at least once. */
  int refcount;           /* reference count; when it drops to 0, the
                             entry is freed. */
};

typedef struct {
  /* Address family, one of AF_INET or AF_INET6. */
  int family;

  /* The actual data, in the form of struct in_addr or in6_addr: */
  union {
    struct in_addr d4;    /* IPv4 address */
#ifdef ENABLE_IPV6
    struct in6_addr d6;   /* IPv6 address */
#endif
  } data;

  /* Under IPv6, getaddrinfo also returns scope_id.  Since it's
     IPv6-specific it strictly belongs in the above union, but we put
     it here for simplicity.  */
#if defined ENABLE_IPV6 && defined HAVE_SOCKADDR_IN6_SCOPE_ID
  int ipv6_scope;
#endif
} ip_address;
以下是hash表提供的接口:
cache_query(host) //search cache_remove(host) //delete cache_store(host,val) //insert
如果在dns hash table中找不到,就调用gethostbyname api来获取host的ip,对每一个ip port进行connect,直到连接成功为止。连接成功host后,把req组包发送出去(request_send)
读取回复头:
Head = read_http_response_head(sock)
此时使用了select作为事件超时机制,并用MSG_PEEK预先读取内核socket read buffer数据,但数据不从buffer中删除,直到找到\r\n\r\n(fd_peek),然后再进行实际读取(fd_read)
New 回复数据包:
resp = resp_new (head);
解析数据包
读取body部分:
hs->res = fd_read_body (sock, fp,contlen != -1 ? contlen : 0,
hs->restval, &hs->rd_size, &hs->len,&hs->dltime,flags);
原文:http://blog.csdn.net/youkuxiaobin/article/details/23618793