
The lxml Parsing Library


Installation

sudo pip3 install lxml

Usage workflow

1. Import the module
   from lxml import etree
2. Create a parsing object
   parse_html = etree.HTML(html)
3. Call xpath() on the parsing object
   r_list = parse_html.xpath('xpath expression')
The result of xpath() is always a list.
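
A minimal runnable sketch of this workflow (the HTML fragment below is a made-up stand-in):

from lxml import etree

# etree.HTML() tolerates fragments and fills in missing tags
html = '<div><a href="/a">first</a><a href="/b">second</a></div>'
parse_html = etree.HTML(html)

# xpath() always returns a list, even for a single match
print(parse_html.xpath('//a/text()'))  # ['first', 'second']
print(parse_html.xpath('//a/@href'))   # ['/a', '/b']
print(parse_html.xpath('//span'))      # [] -- no match is still a list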

Example: HTML sample

<div class="wrapper">
    <a href="/" id="channel">新浪社会</a>
    <ul id="nav">
        <li><a href="http://domestic.sina.com/" title="国内">国内</a></li>
        <li><a href="http://world.sina.com/" title="国际">国际</a></li>
        <li><a href="http://mil.sina.com/" title="军事">军事</a></li>
        <li><a href="http://photo.sina.com/" title="图片">图片</a></li>
        <li><a href="http://society.sina.com/" title="社会">社会</a></li>
        <li><a href="http://ent.sina.com/" title="娱乐">娱乐</a></li>
        <li><a href="http://tech.sina.com/" title="科技">科技</a></li>
        <li><a href="http://sports.sina.com/" title="体育">体育</a></li>
        <li><a href="http://finance.sina.com/" title="财经">财经</a></li>
        <li><a href="http://auto.sina.com/" title="汽车">汽车</a></li>
    </ul>
</div>

Implementation:

# Create the parsing object
parse_html = etree.HTML(html)
# Call xpath(); text() selects text content, and the result is a list
r_list = parse_html.xpath('//a/text()')
#print(r_list)

# Extract the value of every href attribute
r2 = parse_html.xpath('//a/@href')
#print(r2)

# Extract all href values except "/"
r3 = parse_html.xpath('//ul[@id="nav"]/li/a/@href')
#print(r3)

# Get 图片, 军事, ... but not 新浪社会
r4 = parse_html.xpath('//ul[@id="nav"]/li/a/text()')
for r in r4:
    print(r)
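
Run against the sample above, the final loop prints the ten channel names (国内, 国际, 军事, ..., 汽车), one per line; filtering on ul[@id="nav"] is what excludes the 新浪社会 link that sits outside the list.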

The most common xpath pattern

1. First match a list of node objects with a base xpath expression
  # r_list: [node object 1, node object 2, ...]
  r_list = parse_html.xpath('base xpath expression')
2. Iterate over the node objects and keep calling xpath() on each one (relative paths start with "."), as in the sketch below
  for r in r_list:
        name = r.xpath('./xxxxxx')
        star = r.xpath('.//xxxxx')
        time = r.xpath('.//xxxxx')
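
Applied to the HTML sample above, the pattern looks like this (variable names are illustrative):

# Base xpath: one node object per <li>
li_list = parse_html.xpath('//ul[@id="nav"]/li')

for li in li_list:
    # Relative xpath, evaluated inside the current <li> only
    title = li.xpath('./a/@title')  # e.g. ['国内']
    href = li.xpath('./a/@href')    # e.g. ['http://domestic.sina.com/']
    print(title[0], href[0])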

Lianjia second-hand housing case study (xpath)

Implementation steps

  1. Confirm the page is static

    Open the second-hand listings page -> view the page source -> search for a keyword from the page

  2. xpath expressions

1. Base xpath expression (matches the node of each listing)
  //ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]

2. Relative xpath expressions applied to each listing node in turn
   * Name: .//a[@data-el="region"]/text()

   # Layout + area + orientation + renovation: take the first text node,
   # then strip() and split('|') -- see the sketch after this list for the indexing
   info_list = .//div[@class="houseInfo"]/text() -> [0].strip().split('|')
   * Layout (model): info_list[1]
   * Area (area): info_list[2]
   * Orientation (direction): info_list[3]
   * Renovation (perfect): info_list[4]

   * Floor (floor): .//div[@class="positionInfo"]/text()
   * District (address): .//div[@class="positionInfo"]/a/text()
   * Total price (total_price): .//div[@class="totalPrice"]/span/text()
   * Unit price (unit_price): .//div[@class="unitPrice"]/span/text()
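
The indices start at 1 because, assuming the houseInfo text began with a '|' separator (as on the page this was written against), split('|') yields an empty first element. A sketch with a made-up sample string:

# Hypothetical houseInfo text node from a listing
raw = '| 2室1厅 | 59.29平米 | 南 北 | 精装 | 中楼层'
info_list = raw.strip().split('|')
# ['', ' 2室1厅 ', ' 59.29平米 ', ' 南 北 ', ' 精装 ', ' 中楼层']
print(info_list[1].strip())  # 2室1厅   (layout)
print(info_list[2].strip())  # 59.29平米 (area)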

  3. Implementation

import requests
from lxml import etree
import time
import random

class LianjiaSpider(object):
  def __init__(self):
    self.url = 'https://bj.lianjia.com/ershoufang/pg{}/'
    self.headers = {'User-Agent': 'Mozilla/5.0'}

  def get_page(self, url):
    try:
        # Set a timeout; on timeout an exception is raised, caught by
        # except, and this function calls itself to retry the request
        res = requests.get(url, headers=self.headers, timeout=5)
        res.encoding = 'utf-8'
        html = res.text
        self.parse_page(html)
    except Exception as e:
        self.get_page(url)

  def parse_page(self, html):
    parse_html = etree.HTML(html)
    # Base xpath: one node object per listing
    li_list = parse_html.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
    # Empty dict to hold the scraped fields
    house_dict = {}
    # Iterate over the listings and extract every field
    for li in li_list:
      # Listing name
      name_list = li.xpath('.//a[@data-el="region"]/text()')
      house_dict['house_name'] = name_list[0] if name_list else None

      # List: layout + area + orientation + renovation
      info_list = li.xpath('.//div[@class="houseInfo"]/text()')
      house_info = info_list[0].strip().split('|') if info_list else None
      if house_info:
          # Layout
          house_dict['house_model'] = house_info[1]
          # Area
          house_dict['area'] = house_info[2]
          # Orientation
          house_dict['direction'] = house_info[3]
          # Renovation
          house_dict['hardcover'] = house_info[4]
      ###########################################
      # Floor
      floor_list = li.xpath('.//div[@class="positionInfo"]/text()')
      house_dict['floor'] = floor_list[0].strip()[:-2] if floor_list else None
      # District
      address_list = li.xpath('.//div[@class="positionInfo"]/a/text()')
      house_dict['address'] = address_list[0].strip() if address_list else None
      # Total price
      total_list = li.xpath('.//div[@class="totalPrice"]/span/text()')
      house_dict['total_price'] = total_list[0].strip() if total_list else None
      # Unit price
      unit_list = li.xpath('.//div[@class="unitPrice"]/span/text()')
      house_dict['unit_price'] = unit_list[0].strip() if unit_list else None

      print(house_dict)

  def main(self):
    for pg in range(1, 11):
      url = self.url.format(str(pg))
      self.get_page(url)
      print('Page %d scraped successfully' % pg)
      time.sleep(random.randint(1, 3))

if __name__ == '__main__':
  start = time.time()
  spider = LianjiaSpider()
  spider.main()
  end = time.time()
  print('Execution time: %.2f' % (end - start))

Baidu Tieba image scraping

Goal: scrape every image from a specified tieba (forum)

Approach:

1. Get the tieba front-page URL, page through it, and find the URL pattern across pages
2. Get the URLs of all posts on one page: [post link 1, post link 2, ...]
3. Request each post link and extract the image URLs
4. Request each image URL and write the bytes to a local file in 'wb' mode

Implementation steps:

  1. Tieba URL pattern

http://tieba.baidu.com/f?kw=??&pn=50
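
kw is the URL-encoded tieba name and pn advances by 50 per page. A sketch of building a page URL with urllib (the tieba name here is just an example):

from urllib import parse

page = 2
params = parse.urlencode({'kw': 'python', 'pn': (page - 1) * 50})
url = 'http://tieba.baidu.com/f?{}'.format(params)
print(url)  # http://tieba.baidu.com/f?kw=python&pn=50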

  2. xpath expressions

1. Post link xpath
   //div[@class="t_con cleafix"]/div/div/div/a/@href

2. Image link xpath
   //div[@class="d_post_content j_d_post_content  clearfix"]/img[@class="BDE_Image"]/@src

3. Video link xpath
   //div[@class="video_src_wrapper"]/embed/@data-video
   # Note: the front end rewrites the video links in the rendered page, so take
   # the expression from the raw page source (copy the HTML into a formatter to read it)

  3. Implementation

import requests
from urllib import parse
from lxml import etree
import time
import random

class BaiduImgSpider(object):
  def __init__(self):
    self.url = 'http://tieba.baidu.com/f?{}'
    self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}

  # Fetch the html
  def get_html(self, url):
      try:
          res = requests.get(url=url, headers=self.headers)
          res.encoding = 'utf-8'
          html = res.text

          return html
      except Exception as e:
          # Retry on failure; return the result so the caller still gets html
          return self.get_html(url)

  # Parse the html with an xpath expression
  def xpath_func(self, xpath_bds, html):
      parse_html = etree.HTML(html)
      r_list = parse_html.xpath(xpath_bds)

      return r_list


  # Level-1 page: get the post links, which drive all the image downloads
  # Remember: in multi-level scraping, everything is driven from the level-1 page!
  def get_tlink(self, url):
    html = self.get_html(url)
    xpath_bds = '//div[@class="t_con cleafix"]/div/div/div/a/@href'
    # tlink_list: ['/p/23234', '/p/9032323']
    tlink_list = self.xpath_func(xpath_bds, html)
    # Visit each post link in turn and download its images
    if tlink_list:
        for tlink in tlink_list:
          t_url = 'http://tieba.baidu.com' + tlink
          # Extract the image links and save the images
          self.get_image(t_url)
          time.sleep(random.randint(1, 3))
    else:
        print('No Data')

  # Get the image links and download the images
  def get_image(self, t_url):
    html = self.get_html(t_url)
    # Extract the image links
    xpath_bds = '//*[@class="d_post_content j_d_post_content  clearfix"]/img/@src'
    imglink_list = self.xpath_func(xpath_bds, html)

    for imglink in imglink_list:
      html_content = requests.get(imglink, headers=self.headers).content
      filename = imglink[-10:]
      with open(filename, 'wb') as f:
          f.write(html_content)
          print('%s downloaded successfully' % filename)

  # Scrape images for a given tieba name, start page, and end page
  def main(self):
    name = input('Enter the tieba name: ')
    begin = int(input('Enter the start page: '))
    end = int(input('Enter the end page: '))
    for page in range(begin, end + 1):
      # Encode the query parameters
      params = {
        'kw': name,
        'pn': str((page - 1) * 50)
      }
      params = parse.urlencode(params)
      url = self.url.format(params)

      # Start downloading images
      self.get_tlink(url)

if __name__ == '__main__':
  spider = BaiduImgSpider()
  spider.main()

 


Original: https://www.cnblogs.com/maplethefox/p/11338195.html
