首页 > 其他 > 详细

<爬虫实例> 8684公交网-太原公交线路信息

时间:2019-07-05 15:08:15      阅读:127      评论:0      收藏:0      [点我收藏+]
from urllib.parse import urljoin

import requests
from lxml import etree

 4 ‘‘‘访问“8684公交查询网”,抓取太原市公交路线:‘‘‘
 5 
 6 headers = {User-Agent: Mozilla/5.0 (Windows NT 6.3; Win64; x64) 
 7                          AppleWebKit/537.36 (KHTML, like Gecko) 
 8                          Chrome/73.0.3683.86 Safari/537.36}
 9 
10 #用来保存公交路线信息
11 items = []
12 
def get_navigation():
    """Fetch the Taiyuan landing page and return its navigation links.

    Returns:
        list: the ``href`` values found under the ``bus_kt_r1`` block
        (routes whose name starts with a digit) followed by those under
        the ``bus_kt_r2`` block (routes whose name starts with a letter).
    """
    url = "https://taiyuan.8684.cn/"
    print("正在获取导航链接")
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(url, headers=headers, timeout=10)

    # Parse the page and pull out the navigation links.
    tree = etree.HTML(r.text)
    num_href_list = tree.xpath('//div[@class="bus_kt_r1"]/a/@href')
    char_href_list = tree.xpath('//div[@class="bus_kt_r2"]/a/@href')
    return num_href_list + char_href_list

def get_route(navi_list):
    """Collect the route links listed on every navigation page.

    Args:
        navi_list: iterable of navigation hrefs, as returned by
            ``get_navigation``; each is resolved against the site root.

    Returns:
        list: the ``href`` of every link inside the ``con_site_1`` block of
        each navigation page, in page order. Empty when ``navi_list`` is
        empty (no request is made in that case).
    """
    route_list = []
    for navi_href in navi_list:
        # urljoin copes with hrefs that lack a leading slash or are already
        # absolute, where plain string concatenation builds a broken URL.
        route_url = urljoin("https://taiyuan.8684.cn", navi_href)
        print("正在获取以%s开头的公交路线" % navi_href)
        # Without a timeout a stalled connection would block the crawl forever.
        r = requests.get(route_url, headers=headers, timeout=10)

        # Parse the page and keep every route link it lists.
        tree = etree.HTML(r.text)
        route_list.extend(tree.xpath('//div[@id="con_site_1"]/a/@href'))
    return route_list

def _first_text(nodes, default=""):
    """Return the first XPath result in *nodes*, or *default* if it is empty."""
    return nodes[0] if nodes else default


def get_info(route_list):
    """Fetch each route page and append its details to the global ``items``.

    Args:
        route_list: iterable of route hrefs, as returned by ``get_route``;
            each is resolved against the site root.

    Returns:
        None. One dict per route is appended to the module-level ``items``
        list, keyed by the Chinese field names used in the output file.

    A field that is missing from a page is stored as an empty string instead
    of raising ``IndexError``, so one malformed page does not discard the
    routes already collected.
    """
    for route in route_list:
        # urljoin copes with hrefs that lack a leading slash or are already
        # absolute, where plain string concatenation builds a broken URL.
        info_url = urljoin("https://taiyuan.8684.cn", route)
        # Without a timeout a stalled connection would block the crawl forever.
        r = requests.get(info_url, headers=headers, timeout=10)

        # Parse the page and extract the individual fields.
        tree = etree.HTML(r.text)
        route_name = _first_text(tree.xpath('//div[@class="bus_i_t1"]/h1/text()'))
        print("正在获取%s的路线信息" % route_name)
        run_time = _first_text(tree.xpath('//p[@class="bus_i_t4"][1]/text()'))
        ticket_price = _first_text(tree.xpath('//p[@class="bus_i_t4"][2]/text()'))
        update_time = _first_text(tree.xpath('//p[@class="bus_i_t4"][4]/text()'))
        # NOTE: the trailing space inside these class values is intentional;
        # the attribute comparison is exact.
        station_num = tree.xpath('//div[@class="bus_line_top "]/span/text()')
        if len(station_num) == 2:
            # Two counts: the first block is the up direction, the second
            # block is the down direction.
            up_num, down_num = station_num
            up_station_name = tree.xpath(
                '//div[@class="bus_line_site "][1]/div/div/a/text()')
            down_station_name = tree.xpath(
                '//div[@class="bus_line_site "][2]/div/div/a/text()')
        else:
            # Not exactly two counts: use the first count (if any) and the
            # full station list for both directions.
            up_num = down_num = _first_text(station_num)
            up_station_name = tree.xpath(
                '//div[@class="bus_line_site "]/div/div/a/text()')
            down_station_name = list(up_station_name)

        # Assemble the record for this route.
        item = {
            "路线名": route_name,
            "运行时间": run_time,
            "票价": ticket_price,
            "更新时间": update_time,
            "上行站数": up_num,
            "上行站名": up_station_name,
            "下行站数": down_num,
            "下行站名": down_station_name,
        }

        items.append(item)

def main():
    """Run the crawl end to end and write the results to a text file.

    Fetches the navigation links, then the route links, then the details of
    every route, and finally writes one line per collected item to
    ``8684_太原公交路线.txt`` (UTF-8) in the current working directory.
    """
    # Get the navigation links of all bus routes.
    navi_list = get_navigation()
    print("导航链接爬取完毕")

    # Walk the navigation links to find every bus route.
    route_list = get_route(navi_list)
    print("公交路线爬取完毕")

    # Fetch the details of each route; get_info() stores its results in the
    # module-level ``items`` list rather than returning them.
    get_info(route_list)
    print("具体信息爬取完毕")

    # Crawl finished: write the results out. The context manager closes the
    # file even if a write fails.
    with open("8684_太原公交路线.txt", "w", encoding="utf8") as fp:
        for item in items:
            fp.write(str(item) + "\n")

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

 

<爬虫实例> 8684公交网-太原公交线路信息

原文:https://www.cnblogs.com/Finance-IT-gao/p/11136977.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!