
The lxml Parsing Library


Installation

sudo pip3 install lxml

Usage workflow

1. Import the module
   from lxml import etree
2. Create a parsing object
   parse_html = etree.HTML(html)
3. Call xpath() on the parsing object
   r_list = parse_html.xpath('xpath expression')
The result of xpath() is always a list.
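
A minimal runnable sketch of this workflow (the HTML fragment below is a made-up stand-in):

from lxml import etree

# etree.HTML() tolerates fragments and fills in missing tags
html = '<div><a href="/a">first</a><a href="/b">second</a></div>'
parse_html = etree.HTML(html)

# xpath() always returns a list, even for a single match
print(parse_html.xpath('//a/text()'))  # ['first', 'second']
print(parse_html.xpath('//a/@href'))   # ['/a', '/b']
print(parse_html.xpath('//span'))      # [] -- no match is still a list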

Example: HTML sample

<div class="wrapper">
    <a href="/" id="channel">新浪社会</a>
    <ul id="nav">
        <li><a href="http://domestic.sina.com/" title="国内">国内</a></li>
        <li><a href="http://world.sina.com/" title="国际">国际</a></li>
        <li><a href="http://mil.sina.com/" title="军事">军事</a></li>
        <li><a href="http://photo.sina.com/" title="图片">图片</a></li>
        <li><a href="http://society.sina.com/" title="社会">社会</a></li>
        <li><a href="http://ent.sina.com/" title="娱乐">娱乐</a></li>
        <li><a href="http://tech.sina.com/" title="科技">科技</a></li>
        <li><a href="http://sports.sina.com/" title="体育">体育</a></li>
        <li><a href="http://finance.sina.com/" title="财经">财经</a></li>
        <li><a href="http://auto.sina.com/" title="汽车">汽车</a></li>
    </ul>
</div>

Implementation:

# Create the parsing object
parse_html = etree.HTML(html)
# Call xpath(); text() selects text content, and the result is a list
r_list = parse_html.xpath('//a/text()')
#print(r_list)

# Extract the value of every href attribute
r2 = parse_html.xpath('//a/@href')
#print(r2)

# Extract all href values except "/"
r3 = parse_html.xpath('//ul[@id="nav"]/li/a/@href')
#print(r3)

# Get 图片, 军事, ... but not 新浪社会
r4 = parse_html.xpath('//ul[@id="nav"]/li/a/text()')
for r in r4:
    print(r)
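
Run against the sample above, the final loop prints the ten channel names (国内, 国际, 军事, ..., 汽车), one per line; filtering on ul[@id="nav"] is what excludes the 新浪社会 link that sits outside the list.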

The most common xpath pattern

1. First match a list of node objects with a base xpath expression
  # r_list: [node object 1, node object 2, ...]
  r_list = parse_html.xpath('base xpath expression')
2. Iterate over the node objects and keep calling xpath() on each one (relative paths start with "."), as in the sketch below
  for r in r_list:
        name = r.xpath('./xxxxxx')
        star = r.xpath('.//xxxxx')
        time = r.xpath('.//xxxxx')
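
Applied to the HTML sample above, the pattern looks like this (variable names are illustrative):

# Base xpath: one node object per <li>
li_list = parse_html.xpath('//ul[@id="nav"]/li')

for li in li_list:
    # Relative xpath, evaluated inside the current <li> only
    title = li.xpath('./a/@title')  # e.g. ['国内']
    href = li.xpath('./a/@href')    # e.g. ['http://domestic.sina.com/']
    print(title[0], href[0])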

Lianjia second-hand housing case study (xpath)

Implementation steps

  1. Confirm the page is static

    Open the second-hand listings page -> view the page source -> search for a keyword from the page

  2. xpath expressions

1. Base xpath expression (matches the node of each listing)
  //ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]

2. Relative xpath expressions applied to each listing node in turn
   * Name: .//a[@data-el="region"]/text()

   # Layout + area + orientation + renovation: take the first text node,
   # then strip() and split('|') -- see the sketch after this list for the indexing
   info_list = .//div[@class="houseInfo"]/text() -> [0].strip().split('|')
   * Layout (model): info_list[1]
   * Area (area): info_list[2]
   * Orientation (direction): info_list[3]
   * Renovation (perfect): info_list[4]

   * Floor (floor): .//div[@class="positionInfo"]/text()
   * District (address): .//div[@class="positionInfo"]/a/text()
   * Total price (total_price): .//div[@class="totalPrice"]/span/text()
   * Unit price (unit_price): .//div[@class="unitPrice"]/span/text()
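
The indices start at 1 because, assuming the houseInfo text began with a '|' separator (as on the page this was written against), split('|') yields an empty first element. A sketch with a made-up sample string:

# Hypothetical houseInfo text node from a listing
raw = '| 2室1厅 | 59.29平米 | 南 北 | 精装 | 中楼层'
info_list = raw.strip().split('|')
# ['', ' 2室1厅 ', ' 59.29平米 ', ' 南 北 ', ' 精装 ', ' 中楼层']
print(info_list[1].strip())  # 2室1厅   (layout)
print(info_list[2].strip())  # 59.29平米 (area)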

  3. Implementation

import requests
from lxml import etree
import time
import random

class LianjiaSpider(object):
  def __init__(self):
    self.url = 'https://bj.lianjia.com/ershoufang/pg{}/'
    self.headers = {'User-Agent': 'Mozilla/5.0'}

  def get_page(self, url):
    try:
        # Set a timeout; on timeout an exception is raised, caught by
        # except, and this function calls itself to retry the request
        res = requests.get(url, headers=self.headers, timeout=5)
        res.encoding = 'utf-8'
        html = res.text
        self.parse_page(html)
    except Exception as e:
        self.get_page(url)

  def parse_page(self, html):
    parse_html = etree.HTML(html)
    # Base xpath: one node object per listing
    li_list = parse_html.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
    # Empty dict to hold the scraped fields
    house_dict = {}
    # Iterate over the listings and extract every field
    for li in li_list:
      # Listing name
      name_list = li.xpath('.//a[@data-el="region"]/text()')
      house_dict['house_name'] = name_list[0] if name_list else None

      # List: layout + area + orientation + renovation
      info_list = li.xpath('.//div[@class="houseInfo"]/text()')
      house_info = info_list[0].strip().split('|') if info_list else None
      if house_info:
          # Layout
          house_dict['house_model'] = house_info[1]
          # Area
          house_dict['area'] = house_info[2]
          # Orientation
          house_dict['direction'] = house_info[3]
          # Renovation
          house_dict['hardcover'] = house_info[4]
      ###########################################
      # Floor
      floor_list = li.xpath('.//div[@class="positionInfo"]/text()')
      house_dict['floor'] = floor_list[0].strip()[:-2] if floor_list else None
      # District
      address_list = li.xpath('.//div[@class="positionInfo"]/a/text()')
      house_dict['address'] = address_list[0].strip() if address_list else None
      # Total price
      total_list = li.xpath('.//div[@class="totalPrice"]/span/text()')
      house_dict['total_price'] = total_list[0].strip() if total_list else None
      # Unit price
      unit_list = li.xpath('.//div[@class="unitPrice"]/span/text()')
      house_dict['unit_price'] = unit_list[0].strip() if unit_list else None

      print(house_dict)

  def main(self):
    for pg in range(1, 11):
      url = self.url.format(str(pg))
      self.get_page(url)
      print('Page %d scraped successfully' % pg)
      time.sleep(random.randint(1, 3))

if __name__ == '__main__':
  start = time.time()
  spider = LianjiaSpider()
  spider.main()
  end = time.time()
  print('Execution time: %.2f' % (end - start))

Baidu Tieba image scraping

Goal: scrape every image from a specified tieba (forum)

Approach:

1. Get the tieba front-page URL, page through it, and find the URL pattern across pages
2. Get the URLs of all posts on one page: [post link 1, post link 2, ...]
3. Request each post link and extract the image URLs
4. Request each image URL and write the bytes to a local file in 'wb' mode

Implementation steps:

  1. Tieba URL pattern

http://tieba.baidu.com/f?kw=??&pn=50
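
kw is the URL-encoded tieba name and pn advances by 50 per page. A sketch of building a page URL with urllib (the tieba name here is just an example):

from urllib import parse

page = 2
params = parse.urlencode({'kw': 'python', 'pn': (page - 1) * 50})
url = 'http://tieba.baidu.com/f?{}'.format(params)
print(url)  # http://tieba.baidu.com/f?kw=python&pn=50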

  2. xpath expressions

1. Post link xpath
   //div[@class="t_con cleafix"]/div/div/div/a/@href

2. Image link xpath
   //div[@class="d_post_content j_d_post_content  clearfix"]/img[@class="BDE_Image"]/@src

3. Video link xpath
   //div[@class="video_src_wrapper"]/embed/@data-video
   # Note: the front end rewrites the video links in the rendered page, so take
   # the expression from the raw page source (copy the HTML into a formatter to read it)

  3. Implementation

import requests
from urllib import parse
from lxml import etree
import time
import random

class BaiduImgSpider(object):
  def __init__(self):
    self.url = 'http://tieba.baidu.com/f?{}'
    self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}

  # Fetch the html
  def get_html(self, url):
      try:
          res = requests.get(url=url, headers=self.headers)
          res.encoding = 'utf-8'
          html = res.text

          return html
      except Exception as e:
          # Retry on failure; return the result so the caller still gets html
          return self.get_html(url)

  # Parse the html with an xpath expression
  def xpath_func(self, xpath_bds, html):
      parse_html = etree.HTML(html)
      r_list = parse_html.xpath(xpath_bds)

      return r_list


  # Level-1 page: get the post links, which drive all the image downloads
  # Remember: in multi-level scraping, everything is driven from the level-1 page!
  def get_tlink(self, url):
    html = self.get_html(url)
    xpath_bds = '//div[@class="t_con cleafix"]/div/div/div/a/@href'
    # tlink_list: ['/p/23234', '/p/9032323']
    tlink_list = self.xpath_func(xpath_bds, html)
    # Visit each post link in turn and download its images
    if tlink_list:
        for tlink in tlink_list:
          t_url = 'http://tieba.baidu.com' + tlink
          # Extract the image links and save the images
          self.get_image(t_url)
          time.sleep(random.randint(1, 3))
    else:
        print('No Data')

  # Get the image links and download the images
  def get_image(self, t_url):
    html = self.get_html(t_url)
    # Extract the image links
    xpath_bds = '//*[@class="d_post_content j_d_post_content  clearfix"]/img/@src'
    imglink_list = self.xpath_func(xpath_bds, html)

    for imglink in imglink_list:
      html_content = requests.get(imglink, headers=self.headers).content
      filename = imglink[-10:]
      with open(filename, 'wb') as f:
          f.write(html_content)
          print('%s downloaded successfully' % filename)

  # Scrape images for a given tieba name, start page, and end page
  def main(self):
    name = input('Enter the tieba name: ')
    begin = int(input('Enter the start page: '))
    end = int(input('Enter the end page: '))
    for page in range(begin, end + 1):
      # Encode the query parameters
      params = {
        'kw': name,
        'pn': str((page - 1) * 50)
      }
      params = parse.urlencode(params)
      url = self.url.format(params)

      # Start downloading images
      self.get_tlink(url)

if __name__ == '__main__':
  spider = BaiduImgSpider()
  spider.main()

 


Original: https://www.cnblogs.com/maplethefox/p/11338195.html
