python—爬虫

时间：2018-01-12 22:31:36 阅读：363 评论：0 收藏：0 [点我收藏+]

1.1 介绍

通过过滤和分析HTML代码，实现对文件、图片等资源的获取，一般用到：

urllib和urllib2模块
正则表达式（re模块）
requests模块
Scrapy框架

urllib库：

1）获取web页面

2）在远程http服务器上验证

3）表单提交（GET和POST）

4）异常处理（urllib2.URLError）

5）非http协议通信（ftp）

获取页面信息：

urllib2.urlopen(url,data,timeout)

构造Request

request = urllib2.Request(url,data,headers={})

response = urllib2.urlopen(request)

response.read()

1.2 爬取图片

百度贴吧示例链接： http://tieba.baidu.com/p/4229162765

1）从网页链接源代码中查找数据，用于分析和提取url

需下载的某张图片的url：

2）脚本

#! /usr/bin/env python
import urllib,urllib2
import re
def getHtml(url):
    """Fetch ``url`` and return the raw response body.

    The response object is closed explicitly once the body has been read,
    so the connection is not left open until garbage collection.
    Errors raised by ``urllib2.urlopen`` are not caught here and propagate
    to the caller.
    """
    page = urllib2.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()
    
def getImage(html):
    """Download every post image referenced in ``html``.

    Each ``<img class="BDE_Image" src="...">`` tag found in ``html`` has its
    URL printed and its target saved as ``1.jpg``, ``2.jpg``, ... (numbered
    in order of appearance) using a relative filename.
    """
    # "(.*?)" is non-greedy: it stops at the nearest closing double quote
    # instead of running on to the last one in the text.
    pattern = re.compile(r'<img class="BDE_Image" src="(.*?)".*?')
    for index, image_url in enumerate(pattern.findall(html), 1):
        print(image_url)
        # The filename is relative, so the image lands in the current directory.
        urllib.urlretrieve(image_url, filename="%s.jpg" % index)
        
if __name__ == "__main__":
    # Demo run: download every image from one Baidu Tieba thread.
    url = "http://tieba.baidu.com/p/4229162765"
    getImage(getHtml(url))

运行结果：

技术分享图片

1.3 爬取文本—获取作者、内容、点赞数

糗事百科热门段子示例链接：

第一页：https://www.qiushibaike.com/

第二页：https://www.qiushibaike.com/8hr/page/2/

第三页：https://www.qiushibaike.com/8hr/page/3/

1）随机选择一个段子，审查元素，获取：作者、内容、点赞个数url位置，用于定义正则表达式

作者位置<<<<<<

技术分享图片

内容和点赞数位置<<<<<<

技术分享图片

2）>>>>>>脚本版本一：

#!/usr/bin/env python
import urllib,urllib2
import re

page = 1
url = "https://www.qiushibaike.com/8hr/page/" + str(page)
# Regular web sites usually expect a browser-like User-Agent header.
headers = {"user-agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}

# Start from an empty document so the parsing step below always has a string
# to work on; if the download fails it simply finds no matches instead of
# hitting an undefined ``html``.
html = ""
try:
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Close the response even if reading fails part-way.
        response.close()
except urllib2.URLError as e:
    # ``e`` is the caught exception instance; print whichever of the
    # ``code`` / ``reason`` attributes it actually carries.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)

# Three capture groups per story: the author (img alt text), the content
# (first <span>) and the vote count. re.S lets "." match newlines as well,
# so a single match can span several lines of HTML.
re_page = re.compile(r'<div class="author.*?<a.*?<img.*?alt="(.*?)">.*?<div.*?<span>(.*?)</span>.*?<i class="number">(\d+)</i>', re.S)

items = re_page.findall(html)
for item in items:
    for i in item:
        print(i)

运行结果：

>>>>>>>脚本版本二：

将内容中的网页换行符<br/>替换为换行，并去掉各字段首尾的空白字符；页码默认为第一页

#!/usr/bin/env python
#coding:utf-8
import urllib,urllib2
import re

def getPage(page_num=1):
    """Download one page of the "8hr" listing and return its HTML.

    Args:
        page_num: page number appended to the listing URL (defaults to 1).

    Returns:
        The raw response body, or None when the request fails with
        ``urllib2.URLError`` (an error message is printed in that case).
    """
    url = "https://www.qiushibaike.com/8hr/page/" + str(page_num)
    headers = {"user-agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Close the response whether or not the read succeeds.
            response.close()
    except urllib2.URLError as e:
        # Report the status code when the exception has one, otherwise the
        # reason; either way the caller receives None.
        if hasattr(e, "code"):
            print("连接服务器失败，错误代码 %s" % e.code)
        elif hasattr(e, "reason"):
            print("连接服务器失败，错误原因 %s" % e.reason)
        return None
            
def getPageCoent(page_num=1):
    """Extract the stories found on one listing page.

    Args:
        page_num: page number passed through to ``getPage``.

    Returns:
        A list of ``[page_num, author, content, votes]`` lists, one per
        regex match. The list is empty when the page could not be
        downloaded or when nothing matched.
    """
    html = getPage(page_num)
    if html is None:
        # getPage has already printed the failure; passing None on to
        # findall() would raise TypeError.
        return []

    # Capture groups: author (img alt text), content (first <span>), votes.
    # re.S lets "." match newlines so a match can span several lines.
    re_page = re.compile(r'<div class="author.*?<a.*?<img.*?alt="(.*?)">.*?<div.*?<span>(.*?)</span>.*?<i class="number">(\d+)</i>', re.S)
    replaceBR = re.compile(r'<br/>')

    page_contents = []
    for author, content, votes in re_page.findall(html):
        # Turn HTML line breaks into real newlines before trimming.
        content = replaceBR.sub('\n', content)
        page_contents.append([page_num,
                              author.strip(),
                              content.strip(),
                              votes.strip()])
    return page_contents
    
if __name__ == "__main__":
    # Print every field of every story on page 1, each followed by an
    # extra blank line.
    for story in getPageCoent(1):
        for field in story:
            print(str(field) + "\n")

运行结果：

技术分享图片

>>>>>>脚本版本三：

实现交互式爬取

即每按一次enter键，显示一条段子，内容包括：页码、作者、段子内容、点赞数

#!/usr/bin/env python
#coding:utf-8
import urllib,urllib2
import re
import sys

def getPage(page_num=1):
    """Download one page of the "8hr" listing and return its HTML.

    Args:
        page_num: page number appended to the listing URL (defaults to 1).

    Returns:
        The raw response body, or None when the request fails with
        ``urllib2.URLError`` (an error message is printed in that case).
    """
    url = "https://www.qiushibaike.com/8hr/page/" + str(page_num)
    headers = {"user-agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}

    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Close the response whether or not the read succeeds.
            response.close()
    except urllib2.URLError as e:
        # Report the status code when the exception has one, otherwise the
        # reason; either way the caller receives None.
        if hasattr(e, "code"):
            print("连接服务器失败，错误代码 %s" % e.code)
        elif hasattr(e, "reason"):
            print("连接服务器失败，错误原因 %s" % e.reason)
        return None
            
def getPageCoent(page_num=1):
    """Extract the stories found on one listing page.

    Args:
        page_num: page number passed through to ``getPage``.

    Returns:
        A list of ``[page_num, author, content, votes]`` lists, one per
        regex match on the downloaded page.

    Raises:
        TypeError: when ``getPage`` returns None (its failure path), because
            None is handed straight to ``findall``.
    """
    # Capture groups: author (img alt text), content (first <span>), votes.
    # re.S lets "." match newlines so a match can span several lines.
    story_pattern = re.compile(r'<div class="author.*?<a.*?<img.*?alt="(.*?)">.*?<div.*?<span>(.*?)</span>.*?<i class="number">(\d+)</i>', re.S)
    br_pattern = re.compile(r'<br/>')

    matches = story_pattern.findall(getPage(page_num))
    # HTML line breaks become real newlines before the content is trimmed.
    return [
        [page_num, match[0].strip(), br_pattern.sub('\n', match[1]).strip(), match[2].strip()]
        for match in matches
    ]
    
def getOneStory(page_contents):
    """Show the stories of one page interactively, one per line of input.

    Before each story a line is read from standard input: ``q`` or ``Q``
    ends the program via ``sys.exit()``; any other input (including a bare
    Enter) prints the next story.

    Args:
        page_contents: sequence of entries indexed as
            ``[page, author, content, votes]``.
    """
    for entry in page_contents:
        answer = raw_input()
        if answer in ("q", "Q"):
            sys.exit()
        # Output order is page, author, votes, then the content on its own line.
        print("第%d页\t发布人:%s\t赞:%s\n%s\n" % (entry[0], entry[1], entry[3], entry[2]))
        
if __name__ == "__main__":
    print("正在读取段子，按回车看新段子，按(Q|q)退出")
    num = 1
    # Each page is fetched exactly once, inside the loop; there is no
    # separate fetch of page 1 before it.
    while True:
        page_contents = getPageCoent(num)
        if not page_contents:
            # Nothing was parsed from this page (e.g. the listing has run
            # out); stop instead of requesting further pages forever.
            break
        getOneStory(page_contents)
        num += 1

运行结果：

技术分享图片

python—爬虫

原文：http://blog.51cto.com/huangzp/2060411

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)