首页 > 其他 > 详细

糗事百科正则爬虫

时间:2017-10-22 10:15:16      阅读:195      评论:0      收藏:0      [点我收藏+]

参考博客:http://cuiqingcai.com/990.html

# -*- coding:utf-8 -*- 
import urllib
import urllib2
import re

# Index of the "8hr" listing page to download.
page = 1

url = "https://www.qiushibaike.com/8hr/page/" + str(page)
# User-Agent header sent with the request.
headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}
try:
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        content = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    # Alternative pattern noted by the original author for posts with an image:
    # pattern = re.compile(r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<img src="(.*?\.jpg)" .*?stats-vote.*?number">(\d+)', re.S)
    # Pattern the original author labelled as matching posts without an image.
    # Groups: <h2> text, first <span> text, digits after 'number">'.
    # re.S makes "." match newlines so one match can span several lines.
    pattern = re.compile(r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?.*?stats-vote.*?number">(\d+)', re.S)
    items = re.findall(pattern, content)
    for item in items:
        # Single formatted argument keeps the output identical to the
        # space-separated Python 2 print statement.
        print("%s %s %s" % (item[0], item[1].strip(), item[2]))
except urllib2.URLError as e:
    # Report whichever error attributes the exception actually carries.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
    

与用户交互

# -*- coding:utf-8 -*-

import urllib, urllib2
import re
import thread
import time 
# Module-level queue of formatted stories: appended to by
# Qsbk.get_page_items and consumed from the front by Qsbk.get_one_story.
stories = []


class Qsbk():
    """Download qiushibaike.com listing pages and print their posts one by one.

    Parsed posts are stored in the module-level ``stories`` list rather than
    on the instance.
    """

    # Compiled once for the class. Groups, in order: the <h2> text, the first
    # <span> text, and the digits after 'number">'. re.S makes "." match
    # newlines so one match can span several lines of HTML.
    _ITEM_PATTERN = re.compile(r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?.*?stats-vote.*?number">(\d+)', re.S)

    def __init__(self):
        """Set the base listing URL and the request headers."""
        self.url = "https://www.qiushibaike.com/8hr/page/"
        self.headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}

    def get_page(self, page):
        """Download listing page *page* and queue the posts found on it.

        A ``urllib2.URLError`` is not propagated: its ``code`` and/or
        ``reason`` (whichever exist) are printed and nothing is queued.
        """
        fullurl = self.url + str(page)
        try:
            request = urllib2.Request(url=fullurl, headers=self.headers)
            response = urllib2.urlopen(request)
            try:
                body = response.read()
            finally:
                # Release the connection even if reading the body fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            return
        # Parsing happens outside the try so only network code is guarded.
        self.get_page_items(body)

    def get_page_items(self, response):
        """Parse page HTML *response* and append each post to ``stories``.

        Every queued entry is a three-line string: the <h2> text, the vote
        digits, then the <span> text with "<br>" and "<br/>" removed.
        """
        for heading, body, votes in self._ITEM_PATTERN.findall(response):
            body = body.strip().replace("<br>", "").replace("<br/>", "")
            stories.append(heading.strip() + "\n" + votes.strip() + "\n" + body)

    def load_page(self, page):
        """Load listing page *page*; a thin wrapper around ``get_page``.

        This method performs no queue-length check itself; deciding when to
        load another page is left to the caller.
        """
        self.get_page(page)

    def get_one_story(self):
        """Remove the oldest queued story and print it between separators.

        When the queue is empty (for example after a failed download) a
        short notice is printed instead of raising ``IndexError``.
        """
        if not stories:
            print("暂时没有可显示的段子")
            return
        print("--------------------------------------------------------------------------------------")
        print(stories.pop(0))
        print("--------------------------------------------------------------------------------------\n")
    
def main():
    """Run the interactive reader.

    Loads page 0 up front, then prompts repeatedly: entering "q" quits, any
    other input prints one story. Before each story, the next page is loaded
    if fewer than 10 stories remain queued.
    """
    print("段子加载中...")
    spider = Qsbk()
    current_page = 0
    spider.load_page(current_page)
    # The prompt itself drives the loop; "q" is the only input that ends it.
    while raw_input("按任意键看段,按q退出:") != "q":
        if len(stories) < 10:
            current_page += 1
            spider.load_page(current_page)
        spider.get_one_story()


if __name__ == "__main__":
    main()
     

 

糗事百科正则爬虫

原文:http://www.cnblogs.com/cuzz/p/7707596.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!