Installation
sudo pip3 install lxml
Usage workflow
1. Import the module: from lxml import etree
2. Create a parse object: parse_html = etree.HTML(html)
3. Call xpath on the parse object: r_list = parse_html.xpath('xpath expression')
The result of an xpath call is always a list.
Example: HTML sample
<div class="wrapper">
    <a href="/" id="channel">新浪社会</a>
    <ul id="nav">
        <li><a href="http://domestic.sina.com/" title="国内">国内</a></li>
        <li><a href="http://world.sina.com/" title="国际">国际</a></li>
        <li><a href="http://mil.sina.com/" title="军事">军事</a></li>
        <li><a href="http://photo.sina.com/" title="图片">图片</a></li>
        <li><a href="http://society.sina.com/" title="社会">社会</a></li>
        <li><a href="http://ent.sina.com/" title="娱乐">娱乐</a></li>
        <li><a href="http://tech.sina.com/" title="科技">科技</a></li>
        <li><a href="http://sports.sina.com/" title="体育">体育</a></li>
        <li><a href="http://finance.sina.com/" title="财经">财经</a></li>
        <li><a href="http://auto.sina.com/" title="汽车">汽车</a></li>
    </ul>
</div>
Implementation code:
# Create the parse object
parse_html = etree.HTML(html)

# Call xpath to get the result; text() extracts text content
r_list = parse_html.xpath('//a/text()')
#print(r_list)

# Extract all href attribute values
r2 = parse_html.xpath('//a/@href')
#print(r2)

# Extract all href values, excluding "/"
r3 = parse_html.xpath('//ul[@id="nav"]/li/a/@href')
#print(r3)

# Get 图片, 军事, ..., excluding 新浪社会
r4 = parse_html.xpath('//ul[@id="nav"]/li/a/text()')
for r in r4:
    print(r)
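For the sample above, r_list and r2 also pick up the 新浪社会 channel link outside the ul, while r3 and r4 are restricted to ul#nav, so the final loop prints the ten channel names in order: 国内, 国际, 军事, 图片, 社会, 娱乐, 科技, 体育, 财经, 汽车.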
The most common xpath usage pattern
1. First match a list of node objects:
   # r_list: [node object 1, node object 2, ...]
   r_list = parse_html.xpath('base xpath expression')
2. Iterate over the node objects and keep calling xpath relative to each node (see the sketch below):
   for r in r_list:
       name = r.xpath('./xxxxxx')
       star = r.xpath('.//xxxxx')
       time = r.xpath('.//xxxxx')
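As a minimal sketch of this pattern against the sample HTML above (variable names are illustrative): first match the li nodes under ul#nav, then call xpath on each node with a relative expression starting with '.'.

from lxml import etree

html = '''
<ul id="nav">
    <li><a href="http://domestic.sina.com/" title="国内">国内</a></li>
    <li><a href="http://world.sina.com/" title="国际">国际</a></li>
</ul>
'''
parse_html = etree.HTML(html)

# Step 1: the base xpath returns a list of li node objects
li_list = parse_html.xpath('//ul[@id="nav"]/li')

# Step 2: call xpath on each node; '.' means "relative to this node"
for li in li_list:
    title = li.xpath('./a/@title')  # a list, e.g. ['国内']
    href = li.xpath('./a/@href')    # a list, e.g. ['http://domestic.sina.com/']
    print(title[0] if title else None, href[0] if href else None)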
Implementation steps (case: Lianjia second-hand housing listings)
1. Confirm the page is static
Open the second-hand housing listing page -> view the page source -> search for a keyword. If the keyword appears in the raw source, the data is server-rendered; a quick programmatic check is sketched below.
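A sketch of the same check in code: fetch the raw HTML with requests and test whether a keyword you saw in the browser also appears in the un-rendered source (the URL and keyword below are illustrative, not prescribed by the original).

import requests

url = 'https://bj.lianjia.com/ershoufang/'   # illustrative listing page
keyword = '总价'                              # illustrative keyword seen in the browser
res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=5)
res.encoding = 'utf-8'
# If the keyword is present in the raw source, the data is static and
# xpath will work on res.text directly
print('static page' if keyword in res.text else 'likely rendered by JavaScript')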
2.xpath表达式
1、基准xpath表达式(匹配每个房源信息节点列表) //ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"] 2、依次遍历后每个房源信息xpath表达式 * 名称: ‘.//a[@data-el="region"]/text()‘ # 户型+面积+方位+是否精装 info_list = ‘.//div[@class="houseInfo"]/text()‘ [0].strip().split(‘|‘) * 户型(model): info_list[1] * 面积(area): info_list[2] * 方位(direction): info_list[3] * 精装(perfect): info_list[4] * 楼层(floor): ‘.//div[@class="positionInfo"]/text()‘ * 区域(address): ‘.//div[@class="positionInfo"]/a/text()‘ * 总价(total_price): ‘.//div[@class="totalPrice"]/span/text()‘ * 单价(unit_price): ‘.//div[@class="unitPrice"]/span/text()‘
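Since houseInfo is a single pipe-separated string, the indices above assume slot 0 holds the community name after splitting. A minimal parsing sketch with a made-up sample string (the real field order on the live page may differ):

# Hypothetical houseInfo text as it might appear in the page source
house_info = '某小区 | 2室1厅 | 89.5平米 | 南 北 | 精装 | 有电梯'
info_list = house_info.strip().split('|')
# After splitting: index 0 is the name, 1 the layout, 2 the area,
# 3 the orientation, 4 the decoration
print(info_list[1].strip())  # 2室1厅
print(info_list[2].strip())  # 89.5平米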
3. Implementation code
import requests
from lxml import etree
import time
import random

class LianjiaSpider(object):
    def __init__(self):
        self.url = 'https://bj.lianjia.com/ershoufang/pg{}/'
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def get_page(self, url):
        try:
            # Set a timeout; on timeout an exception is raised, caught by
            # except, and the function calls itself to retry the request
            res = requests.get(url, headers=self.headers, timeout=5)
            res.encoding = 'utf-8'
            html = res.text
            self.parse_page(html)
        except Exception as e:
            self.get_page(url)

    def parse_page(self, html):
        parse_html = etree.HTML(html)
        # Base xpath: match the node object of each listing
        li_list = parse_html.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
        # Empty dict to hold the final scraped data
        house_dict = {}
        # Iterate over the listings and extract all required fields
        for li in li_list:
            # Listing name
            name_list = li.xpath('.//a[@data-el="region"]/text()')
            house_dict['house_name'] = [name_list[0] if name_list else None][0]
            # List: layout + area + orientation + decoration
            info_list = li.xpath('.//div[@class="houseInfo"]/text()')
            house_info = [info_list[0].strip().split('|') if info_list else None][0]
            if house_info:
                # Layout
                house_dict['house_model'] = house_info[1]
                # Area
                house_dict['area'] = house_info[2]
                # Orientation
                house_dict['direction'] = house_info[3]
                # Decoration
                house_dict['hardcover'] = house_info[4]
            ###########################################
            # Floor
            floor_list = li.xpath('.//div[@class="positionInfo"]/text()')
            house_dict['floor'] = [floor_list[0].strip()[:-2] if floor_list else None][0]
            # District
            address_list = li.xpath('.//div[@class="positionInfo"]/a/text()')
            house_dict['address'] = [address_list[0].strip() if address_list else None][0]
            # Total price
            total_list = li.xpath('.//div[@class="totalPrice"]/span/text()')
            house_dict['total_price'] = [total_list[0].strip() if total_list else None][0]
            # Unit price
            unit_list = li.xpath('.//div[@class="unitPrice"]/span/text()')
            house_dict['unit_price'] = [unit_list[0].strip() if unit_list else None][0]
            print(house_dict)

    def main(self):
        for pg in range(1, 11):
            url = self.url.format(str(pg))
            self.get_page(url)
            print('Page %d scraped successfully' % pg)
            time.sleep(random.randint(1, 3))

if __name__ == '__main__':
    start = time.time()
    spider = LianjiaSpider()
    spider.main()
    end = time.time()
    print('Elapsed time: %.2f' % (end - start))
Goal: scrape all images from a given Tieba forum
Approach:
1. Get the forum's front-page URL and the next-page URL; work out the URL pattern across pages
2. Collect the URLs of all posts on one page: [post link 1, post link 2, ...]
3. Request each post link and extract the image URLs
4. Request each image URL and write the bytes to a local file in 'wb' mode
Implementation steps:
1. Tieba URL pattern
http://tieba.baidu.com/f?kw=??&pn=50
(?? is the forum name; pn advances by 50 per page, i.e. pn = (page-1)*50 — see the sketch below)
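A small sketch of building the per-page URLs with encoded query parameters, matching the pn rule used in the implementation below (the forum name is just an example):

from urllib import parse

base_url = 'http://tieba.baidu.com/f?{}'
name = 'python'  # example forum name
for page in range(1, 4):
    params = parse.urlencode({'kw': name, 'pn': str((page - 1) * 50)})
    print(base_url.format(params))
# http://tieba.baidu.com/f?kw=python&pn=0
# http://tieba.baidu.com/f?kw=python&pn=50
# http://tieba.baidu.com/f?kw=python&pn=100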
2. xpath expressions
1. Post link xpath:
   //div[@class="t_con cleafix"]/div/div/div/a/@href
2. Image link xpath:
   //div[@class="d_post_content j_d_post_content clearfix"]/img[@class="BDE_Image"]/@src
3. Video link xpath (a small extraction sketch follows):
   //div[@class="video_src_wrapper"]/embed/@data-video
   # Note: the front end rewrites the video markup before rendering, so build this
   # xpath from the raw page source (view-source), not from the DevTools DOM;
   # copy the HTML and run it through an online formatter if needed.
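The implementation below only downloads images; as a hedged illustration of point 3, pulling the real video URL out of the raw source could look like this (the HTML fragment and the .mp4 address are made up; the xpath is the one listed above):

from lxml import etree

# Hypothetical fragment as found in the raw page source (not the rendered DOM)
html = '''
<div class="video_src_wrapper">
    <embed data-video="http://tb-video.bdstatic.com/tieba-smallvideo/example.mp4">
</div>
'''
parse_html = etree.HTML(html)
video_list = parse_html.xpath('//div[@class="video_src_wrapper"]/embed/@data-video')
print(video_list)  # ['http://tb-video.bdstatic.com/tieba-smallvideo/example.mp4']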
3. Implementation code
import requests
from urllib import parse
from lxml import etree
import time
import random

class BaiduImgSpider(object):
    def __init__(self):
        self.url = 'http://tieba.baidu.com/f?{}'
        self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}

    # Fetch the HTML
    def get_html(self, url):
        try:
            res = requests.get(url=url, headers=self.headers)
            res.encoding = 'utf-8'
            html = res.text
            return html
        except Exception as e:
            self.get_html(url)

    # Parse the HTML with an xpath expression
    def xpath_func(self, xpath_bds, html):
        parse_html = etree.HTML(html)
        r_list = parse_html.xpath(xpath_bds)
        return r_list

    # Level-1 page: get post links, then drive all image downloads
    # Remember: in multi-level scraping, everything is driven from the level-1 page!
    def get_tlink(self, url):
        html = self.get_html(url)
        xpath_bds = '//div[@class="t_con cleafix"]/div/div/div/a/@href'
        # tlink_list: ['/p/23234', '/p/9032323']
        tlink_list = self.xpath_func(xpath_bds, html)
        # Visit each post link in turn and download all its images
        if tlink_list:
            for tlink in tlink_list:
                t_url = 'http://tieba.baidu.com' + tlink
                # Extract the image links and save the images
                self.get_image(t_url)
                time.sleep(random.randint(1, 3))
        else:
            print('No Data')

    # Get the image links and save the images
    def get_image(self, t_url):
        html = self.get_html(t_url)
        # Extract image links
        xpath_bds = '//*[@class="d_post_content j_d_post_content clearfix"]/img/@src'
        imglink_list = self.xpath_func(xpath_bds, html)
        for imglink in imglink_list:
            html_content = requests.get(imglink, headers=self.headers).content
            filename = imglink[-10:]
            with open(filename, 'wb') as f:
                f.write(html_content)
                print('%s downloaded' % filename)

    # Scrape images given a forum name, a start page, and an end page
    def main(self):
        name = input('Forum name: ')
        begin = int(input('Start page: '))
        end = int(input('End page: '))
        for page in range(begin, end + 1):
            # Encode the query parameters
            params = {
                'kw': name,
                'pn': str((page - 1) * 50)
            }
            params = parse.urlencode(params)
            url = self.url.format(params)
            # Start fetching images
            self.get_tlink(url)

if __name__ == '__main__':
    spider = BaiduImgSpider()
    spider.main()
Original article: https://www.cnblogs.com/maplethefox/p/11338195.html