re.S(即 re.DOTALL)让 "." 也能匹配换行符 \n;默认情况下 "." 不能匹配换行符。
1.爬取网页源码中的图片
# -*- coding: utf-8 -*-
"""Download the images referenced in a saved HTML page.

Reads ``source.txt``, extracts the ``src`` of every ``img`` tag that carries
``class="lessonimg"`` and stores the images as ``pic/0.jpg``, ``pic/1.jpg``, ...
"""
import os
import re

import requests

with open('source.txt', 'r') as f:
    html = f.read()

# Match the image URLs; the parenthesised group is what findall() returns.
# No flag is passed: the pattern has no ^/$ anchors, so re.M had no effect.
pic_url = re.findall('img src="(.*?)" class="lessonimg"', html)

# Create the output directory up front so the first open() cannot fail on it.
if not os.path.isdir('pic'):
    os.makedirs('pic')

for i, each in enumerate(pic_url):
    print("now downloading:" + each)
    pic = requests.get(each)
    # os.path.join instead of a hard-coded backslash keeps the path portable;
    # the with-block closes the file even if the write fails.
    with open(os.path.join('pic', str(i) + '.jpg'), 'wb') as fp:
        fp.write(pic.content)
2.突破反爬虫机制伪装成浏览器设置headers
# -*- coding: utf-8 -*-
"""Fetch a page while sending a browser-like User-Agent header.

Prints the interpreter's default encoding, the page text, and then every
fragment captured by the two regular expressions below.
"""
import re
import sys

import requests

# On Python 2 the default encoding is often ASCII; switch it to UTF-8.
# reload() and setdefaultencoding() do not exist on Python 3, hence the guard.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding("utf-8")

# Named default_encoding rather than "type" so the builtin is not shadowed.
default_encoding = sys.getdefaultencoding()
print(default_encoding)

# Pretend to be a desktop Firefox so the request is not sent with the
# default python-requests User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:42.0) Gecko/20100101 Firefox/42.0',
}
html = requests.get('http://jp.tingroom.com/yuedu/yd300p/', headers=headers)
# Force UTF-8 decoding of the body instead of relying on requests' guess.
html.encoding = 'utf-8'
print(html.text)

# re.S lets "." match newlines, so a captured fragment may span several lines.
for japanese in re.findall('<span style="color:#666666;">(.*?)</span>', html.text, re.S):
    print(japanese)
for chinese in re.findall('style="color: #039;">(.*?)</a>', html.text, re.S):
    print(chinese)
3.发起post请求
# -*- coding: utf-8 -*-
"""Send a POST request with form data and print the card titles in the reply."""
import re

import requests

url = 'http://www.crowdfunder.com/browse/deals&template=false'
# Form fields sent in the POST body.
# NOTE(review): 'page' is presumably the result page to load - confirm.
data = {
    'entities_only': 'true',
    'page': '3',
}
html_post = requests.post(url, data=data)

# re.S lets "." match newlines, so a title may span several lines of markup.
title = re.findall('class="card-title">(.*?)</div>', html_post.text, re.S)
print(title)
for each in title:
    print(each)
4.爬取极客学院课程详细信息
re.search 只返回第一个匹配(没有匹配时返回 None)
re.findall 返回所有匹配组成的列表
# coding=utf-8
"""Scrape course listings from jikexueyuan.com and save them to a text file.

Written so that it runs on both Python 2 and Python 3.
"""
import io
import re
import sys

__author__ = 'scaleworld'

# On Python 2 the default encoding is often ASCII; switch it to UTF-8.
# reload() and setdefaultencoding() do not exist on Python 3, hence the guard.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding("utf-8")

# Keys of a lesson-info dict, in the order they are written to the file.
_INFO_KEYS = ('title', 'desc', 'time', 'level', 'learnNumber')


def _first_group(pattern, text, group=1):
    """Return the stripped capture *group* of the first match, or '' if none."""
    match = re.search(pattern, text, re.S)
    return match.group(group).strip() if match else ''


class Spider(object):
    """Fetches course pages, parses the lesson blocks and saves the result."""

    def __init__(self):
        print('开始爬取极客学院课程信息。。。')

    def getsource(self, url):
        """Download *url* and return the response body as text."""
        # Imported here so the parsing and saving helpers stay importable
        # without the HTTP dependency.
        import requests

        html = requests.get(url)
        return html.text

    def getlessons(self, source):
        """Return the raw HTML of every lesson block found in *source*.

        A block is whatever sits between ``deg="0" >`` and the next ``</li>``;
        re.S lets it span several lines.
        """
        return re.findall('deg="0" >(.*?)</li>', source, re.S)

    def getlessonInfo(self, lesson):
        """Parse one lesson block into a dict.

        The dict has the keys ``title``, ``desc``, ``time``, ``level`` and
        ``learnNumber``. A field whose markup is missing from *lesson* is
        returned as an empty string instead of raising, so one malformed
        block cannot abort the whole crawl.
        """
        info = {}
        # Group 1 swallows the anchor's attributes; group 2 is the title text.
        info['title'] = _first_group(
            '<h2 class="lesson-info-h2"><a(.*?)>(.*?)</a></h2>', lesson, group=2)
        info['desc'] = _first_group(
            '<p style="height: 0px; opacity: 0; display: none;">(.*?)</p>', lesson)

        # The first bare <em> holds the duration, the second one the level.
        timeandlevel = re.findall('<em>(.*?)</em>', lesson, re.S)
        if timeandlevel:
            info['time'] = timeandlevel[0].strip().replace("\n", "").replace(" ", "")
        else:
            info['time'] = ''
        info['level'] = timeandlevel[1].strip() if len(timeandlevel) > 1 else ''

        info['learnNumber'] = _first_group('"learn-number">(.*?)</em>', lesson)
        return info

    def savelessionInfos(self, lessonInfos, path='LessionInfos.txt'):
        """Write every lesson-info dict in *lessonInfos* to *path*.

        The file is created if missing and overwritten otherwise (mode 'w';
        use 'a' instead to append). It is always written as UTF-8 so the
        result does not depend on the platform's default encoding.
        """
        with io.open(path, 'w', encoding='utf-8') as f:
            for i, each in enumerate(lessonInfos, 1):
                f.write(u'第{0}个课程:\n'.format(i))
                for key in _INFO_KEYS:
                    f.write(u'{0}:{1}\n'.format(key, each[key]))
                # Blank line between two lessons.
                f.write(u'\n')


def main():
    """Crawl course-list pages 1 through 20 and save all lessons found."""
    lessonInfos = []
    # URL of the course listing; pages are selected with ?pageNum=N.
    url = 'http://www.jikexueyuan.com/course/'
    spider = Spider()
    # range(1, 21) covers pages 1 to 20 inclusive.
    for i in range(1, 21):
        pageUrl = url + '?pageNum=' + str(i)
        print('正在处理页面:' + pageUrl)
        source = spider.getsource(pageUrl)
        lessons = spider.getlessons(source)
        # Debugging aid: lessonInfo.get('title') returns None instead of
        # raising when a key is absent from the dict.
        lessonInfos.extend(spider.getlessonInfo(lesson) for lesson in lessons)
        print('已处理' + str(len(lessons)) + '个课程信息。')
    print('极客学院课程信息爬取完毕,正在保存课程信息。。。')
    spider.savelessionInfos(lessonInfos)
    print('极客学院课程信息保存完毕。')


if __name__ == '__main__':
    main()
本文出自 “点滴积累” 博客,请务必保留此出处http://tianxingzhe.blog.51cto.com/3390077/1726527
原文:http://tianxingzhe.blog.51cto.com/3390077/1726527