1. Common regex usage
import re

# Extract "python"
key = "javapythonc++php"
re.findall('python', key)[0]

# Extract "hello world"
key = "<html><h1>hello world<h1></html>"
re.findall('<h1>(.*?)<h1>', key)[0]

# Extract 170
string = '我喜欢身高为170的女孩'   # "I like girls who are 170cm tall"
re.findall(r'\d+', string)[0]

# Extract http:// and https://
key = 'http://www.baidu.com and https://boob.com'
re.findall('https?://', key)

# Extract "hello" (the tags vary in case; without the capture group the match is <hTml>hello</HtMl>)
key = 'lalala<hTml>hello</HtMl>hahah'
re.findall(r'<[Hh][Tt][Mm][Ll]>(.*?)</[Hh][Tt][Mm][Ll]>', key)  # one possible pattern

# Extract "hit."
key = 'bobo@hit.edu.com'   # we want to match "hit."
re.findall(r'h.*?\.', key)

# Match "sas" and "saas" but not "saaas"
key = 'saas and sas and saaas'
re.findall(r'sa{1,2}s', key)  # one possible pattern

# Match the lines that start with "i"
string = '''fall in love with you
i love you very much
i love she
i love her'''
re.findall('^i.*', string, re.M)

# Match across all lines (re.S lets . match newlines too)
string1 = """<div>细思极恐
你的队友在看书
你的闺蜜在减肥
你的敌人在磨刀
隔壁老王在炼药
</div>"""
re.findall('.*', string1, re.S)
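The last two examples hinge on the difference between the re.M and re.S flags. A minimal runnable sketch of that difference (the sample string is invented for illustration):

import re

text = 'i say hi\ni say bye'

# Without re.M, ^ anchors only at the start of the whole string.
print(re.findall(r'^i.*', text))        # ['i say hi']
# With re.M, ^ anchors at the start of every line.
print(re.findall(r'^i.*', text, re.M))  # ['i say hi', 'i say bye']
# With re.S, . also matches newlines, so .* spans the whole string.
print(re.findall(r'.*', text, re.S))    # ['i say hi\ni say bye', ''] (plus a final empty match)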
# Scrape all images from the Qiushibaike "糗图" (funny pictures) section
import re
import os
import requests
from urllib import request

# 1. Check whether the page data is loaded dynamically
# 2. Fetch the page source
if not os.path.exists('qiutu'):
    os.mkdir('qiutu')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
url = 'https://www.qiushibaike.com/pic/'
page_text = requests.get(url=url, headers=headers).text

# 3. Parse the src attribute out of each img tag
ex = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'
img_url_list = re.findall(ex, page_text, re.S)
for img_url in img_url_list:
    img_url = 'https:' + img_url              # the src values are protocol-relative
    imgPath = 'qiutu/' + img_url.split('/')[-1]
    # 4. Request each image URL
    # 5. Persist it to disk
    request.urlretrieve(url=img_url, filename=imgPath)
    print(imgPath + ' downloaded!')
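One caveat: urlretrieve sends urllib's default User-Agent, so the custom headers above do not apply to the image requests, and some sites reject them. A small alternative sketch (not from the original post) that downloads with requests instead, reusing the img_url, imgPath, and headers defined above:

# Download with requests so the same headers apply to the image request too.
img_data = requests.get(url=img_url, headers=headers).content
with open(imgPath, 'wb') as fp:
    fp.write(img_data)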
Install lxml, which provides the etree module used for XPath parsing below:

pip install lxml
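Before the full scraper, a minimal sketch of the etree/XPath workflow it relies on (the HTML snippet here is invented for illustration):

from lxml import etree

html = '<div id="container"><a href="/a.html"><img alt="template-1"/></a></div>'
tree = etree.HTML(html)   # parse an HTML string into a queryable tree

# xpath() always returns a list; attribute queries (@href, @alt) yield strings.
print(tree.xpath('//div[@id="container"]/a/@href'))    # ['/a.html']
print(tree.xpath('//div[@id="container"]/a/img/@alt')) # ['template-1']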
1. Scraping résumé templates

import requests
import os
import random
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
url = 'http://sc.chinaz.com/jianli/free.html'
response = requests.get(url=url, headers=headers)
response.encoding = 'utf-8'   # avoid mojibake in the template names
page_text = response.text

if not os.path.exists('jianli'):
    os.mkdir('jianli')

tree = etree.HTML(page_text)
div_list = tree.xpath('//div[@id="container"]/div')
for div in div_list:
    detail_url = div.xpath('./a/@href')[0]
    name = div.xpath('./a/img/@alt')[0]
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    detail_tree = etree.HTML(detail_page_text)   # separate tree; don't shadow the list page's
    download_url_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
    download_url = random.choice(download_url_list)   # pick one download mirror at random
    jianli_data = requests.get(url=download_url, headers=headers).content
    file_path = 'jianli/' + name + '.rar'
    with open(file_path, 'wb') as fp:
        fp.write(jianli_data)
    print(file_path + ' downloaded')

###### Handling multiple pages

import requests
import os
import random
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Connection': 'close'   # close each connection instead of keeping it alive
}
start_page = 1
end_page = 5

if not os.path.exists('jianli'):
    os.mkdir('jianli')

url = 'http://sc.chinaz.com/jianli/free_%d.html'
for page in range(start_page, end_page + 1):
    if page == 1:
        new_url = 'http://sc.chinaz.com/jianli/free.html'   # page 1 has no _1 suffix
    else:
        new_url = url % page
    response = requests.get(url=new_url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        detail_url = div.xpath('./a/@href')[0]
        name = div.xpath('./a/img/@alt')[0]
        detail_page_text = requests.get(url=detail_url, headers=headers).text
        detail_tree = etree.HTML(detail_page_text)
        download_url_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
        download_url = random.choice(download_url_list)
        jianli_data = requests.get(url=download_url, headers=headers).content
        file_path = 'jianli/' + name + '.rar'
        with open(file_path, 'wb') as fp:
            fp.write(jianli_data)
        print(file_path + ' downloaded')
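The 'Connection': 'close' header in the multi-page version is a common workaround for "Max retries exceeded" errors when firing many requests in a quick loop. An alternative sketch (my own suggestion, not from the original post) is to reuse a single requests.Session, which pools and reuses TCP connections instead of opening a new one per request:

import requests

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
})

# Every session.get() below shares the same keep-alive connection pool.
resp = session.get('http://sc.chinaz.com/jianli/free.html')
resp.encoding = 'utf-8'
page_text = resp.text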
Original post: https://www.cnblogs.com/TodayWind/p/13767810.html