参考博客:http://cuiqingcai.com/990.html
# -*- coding:utf-8 -*-
# Download one listing page from qiushibaike.com and print, for every post
# matched by `pattern`, the three captured fields on one line.
#
# The script runs on both Python 2 and Python 3: the module names differ
# between the two, so the imports below fall back to the Python 3 locations.
import re
import urllib

try:
    # Python 2
    import urllib2
    from urllib2 import URLError
except ImportError:
    # Python 3: Request/urlopen live in urllib.request, URLError in urllib.error.
    import urllib.request as urllib2
    from urllib.error import URLError

page = 1
url = "https://www.qiushibaike.com/8hr/page/" + str(page)
headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}

# Alternative pattern for posts that contain an image (kept for reference):
# pattern = re.compile(r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?<img src="(.*?\.jpg)" .*?stats-vote.*?number">(\d+)', re.S)

# Pattern for posts without an image.  re.S (DOTALL) lets "." match newlines,
# so a single match can span several lines of HTML.  Groups: <h2> content,
# first <span> content, digits following `stats-vote ... number">`.
pattern = re.compile(
    r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?.*?stats-vote.*?number">(\d+)',
    re.S,
)

try:
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        content = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
except URLError as e:
    # Report whichever details the error object carries (HTTP errors have a
    # status code; lower-level failures only have a reason).
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
else:
    # Python 3 returns bytes, which a str pattern cannot search; Python 2
    # already returns str and is left untouched.
    # NOTE(review): assumes the page is UTF-8 encoded - confirm.
    if not isinstance(content, str):
        content = content.decode("utf-8", "replace")

    for item in pattern.findall(content):
        # Same output as the Python 2 statement `print a, b, c`.
        print(" ".join((item[0], item[1].strip(), item[2])))
与用户交互
# -*- coding:utf-8 -*-
# Interactive reader for qiushibaike.com: stories are scraped page by page
# into the module-level `stories` queue and shown one per key press.
#
# The script runs on both Python 2 and Python 3; the shims below map the
# Python 2 names used by the code onto their Python 3 equivalents.
import re
import time
import urllib

try:
    # Python 2
    import thread
    import urllib2
    from urllib2 import URLError
except ImportError:
    # Python 3
    import _thread as thread
    import urllib.request as urllib2
    from urllib.error import URLError

try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3

# Queue of formatted stories waiting to be displayed (oldest first).
stories = []


class Qsbk(object):
    """Scraper and viewer for qiushibaike.com listing pages."""

    # Compiled once for all calls.  re.S (DOTALL) lets "." match newlines so
    # one match can span several lines of HTML.  Groups: <h2> content, first
    # <span> content, digits following `stats-vote ... number">`.
    STORY_PATTERN = re.compile(
        r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<span>(.*?)</span>.*?.*?stats-vote.*?number">(\d+)',
        re.S,
    )

    def __init__(self):
        """Store the base listing URL and the request headers."""
        self.url = "https://www.qiushibaike.com/8hr/page/"
        self.headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}

    def get_page(self, page):
        """Download listing page number `page` and queue the stories found.

        Network failures (URLError and subclasses) are reported on stdout
        and otherwise ignored, so the caller may end up with no new stories.
        """
        fullurl = self.url + str(page)
        try:
            request = urllib2.Request(url=fullurl, headers=self.headers)
            connection = urllib2.urlopen(request)
            try:
                response = connection.read()
            finally:
                # Release the connection even if reading fails.
                connection.close()
        except URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            return

        # Python 3 returns bytes, which a str pattern cannot search; Python 2
        # already returns str and is left untouched.
        # NOTE(review): assumes the page is UTF-8 encoded - confirm.
        if not isinstance(response, str):
            response = response.decode("utf-8", "replace")
        self.get_page_items(response)

    def get_page_items(self, response):
        """Extract every story from the HTML text `response`.

        Each match is appended to the global `stories` list as three lines:
        the <h2> text, the captured number, and the <span> text with
        "<br>" / "<br/>" tags removed.
        """
        global stories
        for item in self.STORY_PATTERN.findall(response):
            body = item[1].strip().replace("<br>", "").replace("<br/>", "")
            stories.append(item[0].strip() + "\n" + item[2].strip() + "\n" + body)

    def load_page(self, page):
        """Load listing page `page` into the story queue.

        This always fetches; deciding when the queue is running low is left
        to the caller (see `main`).
        """
        self.get_page(page)

    def get_one_story(self):
        """Pop the oldest queued story and print it between two rules.

        When the queue is empty (e.g. every download failed) a short notice
        is printed instead of raising IndexError.
        """
        global stories
        if not stories:
            print("暂时没有可显示的段子")
            return
        rule = "--------------------------------------------------------------------------------------"
        print(rule)
        print(stories.pop(0))
        print(rule + "\n")


def main():
    """Run the interactive loop: any input shows a story, "q" quits."""
    print("段子加载中...")
    qsbk = Qsbk()
    # NOTE(review): numbering starts at 0 here; confirm whether the site's
    # first page is 0 or 1 to avoid fetching the same page twice.
    page = 0
    qsbk.load_page(page)
    while True:
        option = read_line("按任意键看段,按q退出:")
        if "q" == option:
            break
        # Top the queue up before it runs dry.
        if len(stories) < 10:
            page += 1
            qsbk.load_page(page)
        qsbk.get_one_story()


if __name__ == "__main__":
    main()
原文:http://www.cnblogs.com/cuzz/p/7707596.html