Crawler: a program that simulates a browser in order to fetch data from the internet.
It follows the same request flow a browser uses.
robots protocol: constrains what data a crawler may fetch; it is essentially a gentleman's agreement and is not enforced.
Anti-crawler: the site uses strategies and technical measures to prevent its data from being crawled.
Anti-anti-crawler: breaking through those anti-crawler measures.
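The robots protocol can also be checked programmatically before crawling; a minimal sketch using the standard library's urllib.robotparser (the target site is only an example):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://www.baidu.com/robots.txt')  # the site's robots.txt
rp.read()
# True or False depending on what robots.txt allows for this user agent and path
print(rp.can_fetch('*', 'https://www.baidu.com/s?wd=python'))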
import urllib.request
url='https://www.baidu.com/'
response=urllib.request.urlopen(url=url) # send the request and get a response object
page_text=response.read() # read the response body as bytes
print(page_text)
with open('1.html','wb') as f:
    f.write(page_text)
print('done')
A URL must not contain non-ASCII characters.
Character-encoding conversion:
import urllib.request
import urllib.parse
url='http://www.baidu.com/s?wd='
word=urllib.parse.quote('人民币') # percent-encode the Chinese keyword
url+=word # append the encoded keyword to the url
response=urllib.request.urlopen(url=url)
page_text=response.read()
with open('2.html','wb') as f:
    f.write(page_text)
print('done')
User-Agent (UA): identifies the client sending the request.
Anti-crawler: the site inspects the UA of incoming requests.
Anti-anti-crawler: disguise the crawler's UA as a real browser's.
import urllib.request
url='http://www.baidu.com/'
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
} # custom request headers (disguise the UA)
request=urllib.request.Request(url=url,headers=headers)
response=urllib.request.urlopen(request)
text=response.read()
with open('3.html','wb') as f:
    f.write(text)
Find the POST interface's URL and parameters in the browser dev tools under Network > XHR.
import urllib.request
import urllib.parse
url='https://fanyi.baidu.com/sug'
data={
    'kw':'西瓜'
} # wrap the POST parameters in a dict
data=urllib.parse.urlencode(data) # url-encode the dict into a query string
data=data.encode() # convert the string to bytes (POST data must be bytes)
response=urllib.request.urlopen(url=url,data=data)
ret=response.read() # a JSON string
with open('4.html','wb') as f:
    f.write(ret)
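Since the interface returns a JSON string, the bytes can also be decoded directly instead of only being written to a file; a small sketch using the ret from above:

import json

result = json.loads(ret) # parse the JSON response into a Python dict (accepts bytes in Python 3.6+)
print(result)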
import requests
url='https://www.sogou.com/'
response=requests.get(url=url)
page_data=response.text # the page data as a string
with open('sogou.html','w',encoding='utf-8') as f:
    f.write(page_data)
response.text # page data as a string
response.content # page data as bytes
response.status_code # the response status code
response.headers # the response headers
response.url # the requested url
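A quick sketch that prints a few of these attributes for the sogou response obtained above:

print(response.status_code) # e.g. 200
print(response.url) # the final requested url
print(response.headers.get('Content-Type')) # one of the response headers
print(len(response.content)) # size of the body in bytes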
Method 1: embed the query parameters directly in the URL
import requests
url='https://www.sogou.com/web?query=周杰伦&ie=utf8'
response=requests.get(url=url)
page_text=response.text
Method 2: pass the query parameters via the params argument
import requests
url='https://www.sogou.com/web'
params={
'query':'周杰伦',
'ie':'utf8'
}
response=requests.get(url=url,params=params)
page_text=response.text
import requests
url='https://www.sogou.com/web'
params={
'query':'周杰伦',
'ie':'utf8'
}
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
response=requests.get(url=url,params=params,headers=headers)
page_text=response.text
For a POST login, first find the request URL in the dev tools and work out the form data to send.
import requests
url='https://www.douban.com/accounts/login'
data={
'source': 'index_nav',
'form_email': '18668573649@163.com',
'form_password': 'k365532902',
}
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
response=requests.post(url=url,data=data,headers=headers)
page_text=response.text
with open('douban.html','w',encoding='utf8') as f:
    f.write(page_text)
Find the ajax request's URL and its parameters first.
import requests
url='https://movie.douban.com/j/chart/top_list?'
params={
'type': '17',
'interval_id': '100:90',
'action': '',
'start': '60',
'limit': '20',
}
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
response=requests.get(url=url,params=params,headers=headers)
page_text=response.text
import requests
url='http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
data={
'cname': '',
'pid': '',
'keyword': '宁波',
'pageIndex': '1',
'pageSize': '10',
}
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
response=requests.post(url=url,data=data,headers=headers)
page_text=response.text
Fetch a range of result pages for a given keyword.
import requests
import os
if not os.path.exists('pages'):
    os.mkdir('pages')
word=input('enter a word') # the search keyword
url='https://zhihu.sogou.com/zhihu'
start_pagenum=int(input('enter a start pagenum')) # first page number
end_pagenum=int(input('enter an end pagenum')) # last page number
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
for page in range(start_pagenum,end_pagenum+1):
    params={
        'query':word,
        'page':page, # page number
        'ie':'utf-8'
    }
    response=requests.get(url=url,params=params,headers=headers)
    page_text = response.text
    file_name=word+str(page)+'.html'
    filepath='pages/'+file_name
    with open(filepath,'w',encoding='utf-8') as f:
        f.write(page_text)
Blocked by the captcha: the login form also needs the captcha fields, and a session keeps the login cookies for later requests.
import requests
session = requests.session()
login_url = 'https://www.douban.com/accounts/login'
data = {
'source': 'None',
'redir': 'https://www.douban.com/people/186654449/',
'form_email': '18668573649@163.com',
'form_password': 'k365532902',
'captcha-solution': 'cough',
'captcha-id': 'TONAWBuNAp3yeI8r67VCHiYx:en',
'remember': 'on',
'login': '登录',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
login_response = session.post(url=login_url, data=data, headers=headers)
url = 'https://www.douban.com/people/186654449/'
response = session.get(url=url, headers=headers)
page_text = response.text
with open('5.html', 'w', encoding='utf8') as f:
    f.write(page_text)
import requests
url = 'http://www.baidu.com/s?word=ip'
proxy = {
'http': '223.111.254.83:80',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
response = requests.get(url=url, proxies=proxy, headers=headers)
with open('daili.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
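requests picks the proxy by URL scheme, so requests to https:// URLs need an 'https' entry as well; a minimal sketch reusing the url and headers above (the address is only a placeholder, not a known working proxy):

proxy = {
    'http': '223.111.254.83:80',
    'https': '223.111.254.83:80', # placeholder; substitute a proxy that supports HTTPS
}
response = requests.get(url=url, proxies=proxy, headers=headers)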
Captcha recognition with the Yundama (云打码) platform
import requests, json, time, re
from lxml import etree
import yan
# use the wrapped Yundama helper module
def get_code(code_img):
    username = 'qych1988gw'
    # password
    password = 'k365532902'
    # software ID, a required parameter for the developer revenue share; found under "My software" in the developer console
    appid = 6570
    # software key, a required parameter for the developer revenue share; found under "My software" in the developer console
    appkey = 'b1237a57c579e506735ffffed31b675c'
    # image file
    filename = code_img
    # captcha type, e.g. 1004 means 4 alphanumeric characters; types are priced differently, see http://www.yundama.com/price.html
    codetype = 3000
    # timeout in seconds
    timeout = 20
    # sanity check
    if (username == 'username'):
        print('set the parameters above before testing')
    else:
        # initialize
        yundama = yan.YDMHttp(username, password, appid, appkey)
        # log in to Yundama
        uid = yundama.login()
        print('uid: %s' % uid)
        # query the account balance
        balance = yundama.balance()
        print('balance: %s' % balance)
        # start recognition: image path, captcha type id, timeout in seconds; returns the result
        cid, result = yundama.decode(filename, codetype, timeout)
        print('cid: %s, result: %s' % (cid, result))
        return result
# extract the captcha image
url = 'https://www.douban.com/accounts/login'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
code_image_url = tree.xpath('//*[@id="captcha_image"]/@src')[0]
# <img id="captcha_image" src="https://www.douban.com/misc/captcha?id=52yHi3MS1mj5PeSbamzYkPEp:en&size=s" alt="captcha" class="captcha_image">
code_img = requests.get(url=code_image_url, headers=headers).content
# extract the captcha id
c_id = re.findall('<img id="captcha_image".*?id=(.*?)&.*?>', page_text, re.S)[0]
with open('code_img.png', 'wb') as f:
    f.write(code_img)
# recognize the captcha image
codetext = get_code('code_img.png')
post = 'https://www.douban.com/accounts/login'
data = {
'source': 'None',
'redir': 'https://www.douban.com/people/186654449/',
'form_email': '18668573649@163.com',
'form_password': 'k365532902',
'captcha-solution': codetext,
'captcha-id': c_id,
'login': '登录'
}
login_text=requests.post(url=post,data=data,headers=headers).text
with open('login.html','w',encoding='utf-8') as f:
    f.write(login_text)
Download images with a regex.
import requests, re, os
url = 'https://www.qiushibaike.com/pic/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
page_text = response.text
img_list = re.findall('<div class="thumb">.*?<img src="(.*?)".*?>.*?</div>', page_text, re.S)
if not os.path.exists('imgs'):
    os.mkdir('imgs')
for i in img_list:
    img_url = 'https:' + i
    img_data = requests.get(url=img_url, headers=headers).content
    img_name=i.split('/')[-1]
    img_path='imgs/'+img_name
    with open(img_path, 'wb') as f:
        f.write(img_data)
from lxml import etree
Create an etree object, then parse the target data with xpath.
xpath always returns a list.
Attribute filtering:
find the div whose class is "song": //div[@class="song"]
Hierarchy & index:
find the a tag under the second li child of the ul that is a direct child of the div whose class is "tang": //div[@class="tang"]/ul/li[2]/a
Logical operators:
find the a tag whose href is empty and whose class is "du": //a[@href="" and @class="du"]
Fuzzy matching:
//div[contains(@class,"ng")]
//div[starts-with(@class,"ta")]
Text extraction:
text of the tag itself: //div[@class="song"]/p[1]/text()
text of the tag and all of its descendant tags: //div[@class="tang"]//text()
Attribute extraction:
//div[@class="tang"]//li[2]/a/@href
from lxml import etree
tree=etree.parse('text.html')
tree.xpath('//div[@class="song"]')
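A small runnable sketch of a few of the expressions above, parsing an inline snippet with etree.HTML instead of a local file (the HTML is made up for illustration):

from lxml import etree

html = '''
<div class="tang">
  <ul>
    <li><a href="http://a.example.com">first</a></li>
    <li><a href="http://b.example.com">second</a></li>
  </ul>
</div>
'''
tree = etree.HTML(html)
print(tree.xpath('//div[@class="tang"]/ul/li[2]/a'))      # element list: the second li's a tag
print(tree.xpath('//div[@class="tang"]//li[2]/a/@href'))  # ['http://b.example.com']
print(tree.xpath('//div[@class="tang"]//text()'))         # all text nodes under the div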
import requests
from lxml import etree
url = 'https://ishuo.cn/joke'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@id="list"]/ul/li')
for li in li_list:
content = li.xpath('./div[@class="content"]/text()')[0]
title = li.xpath('./div[@class="info"]/a/text()')[0]
print(title,content)
BeautifulSoup: Python-only, and even simpler and more convenient to use.
If the HTML document is a local file: BeautifulSoup(open('local_file.html'), 'lxml')
If the HTML comes from a network request: BeautifulSoup(page_text, 'lxml')
from bs4 import BeautifulSoup
f = open('text.html')
soup = BeautifulSoup(f, 'lxml')
Attributes and methods:
Look up by tag name (returns only the first match):
soup.div
Get attributes:
soup.a.attrs # all attributes, returned as a dict
soup.a.attrs['href'] or soup.a['href'] # a specific attribute
Get content:
soup.a.string # like /text()
soup.a.text # like //text()
soup.a.get_text() # like //text()
find:
soup.find('a')
soup.find('a',title="")
soup.find('a',alt="")
soup.find('a',class_="") # note the trailing underscore
soup.find('a',id="")
find_all (returns a list):
soup.find_all('a')
soup.find_all(['a','div'])
soup.find_all('a',limit=2)
select (CSS selectors, returns a list):
soup.select('#feng')
soup.select('div>img')
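A short runnable sketch of these calls on an inline snippet (the HTML is made up for illustration):

from bs4 import BeautifulSoup

html = '<div><a id="feng" href="http://example.com" class="du">hello</a><a href="http://example.org">world</a></div>'
soup = BeautifulSoup(html, 'lxml')
print(soup.a['href'])               # http://example.com
print(soup.a.string)                # hello
print(soup.find('a', class_="du"))  # first a tag whose class is "du"
print(soup.find_all('a', limit=2))  # list of the first two a tags
print(soup.select('#feng'))         # list of elements with id "feng"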
from bs4 import BeautifulSoup
import requests
url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')
a_list = soup.select('.book-mulu > ul > li > a')
def get_content(content_url):
    content_page = requests.get(url=content_url, headers=headers).text
    soup = BeautifulSoup(content_page, 'lxml')
    div = soup.find('div', class_="chapter_content")
    return div.text
f=open('sanguo.txt','w',encoding='utf-8')
for a in a_list:
    title = a.string
    a_url = a['href']
    content_url = 'http://www.shicimingju.com' + a_url
    content = get_content(content_url)
    f.write(title+'\n\n'+content+'\n\n\n')
f.close()
Selenium handles crawling of dynamically loaded page data.
find_element_by_id # find a node by id
find_element_by_name # find a node by name
find_element_by_xpath # find a node by xpath
find_element_by_tag_name # find a node by tag name
find_element_by_class_name # find a node by class name
from selenium import webdriver
import time
bro = webdriver.Chrome(executable_path='chromedriver')
bro.get('http://www.baidu.com')
time.sleep(1)
text=bro.find_element_by_id('kw') # locate the search input box
text.send_keys('人民币') # type into the input box
time.sleep(1)
button=bro.find_element_by_id('su')
button.click() # click the search button
time.sleep(3)
bro.quit() # close the browser
PhantomJS is a headless browser; its automation flow is the same as with Chrome, but it can take screenshots along the way.
from selenium import webdriver
import time
bro=webdriver.PhantomJS(executable_path=r'phantomjs-2.1.1-windows\bin\phantomjs.exe')
bro.get('http://www.baidu.com')
bro.save_screenshot('1.png')
time.sleep(1)
text=bro.find_element_by_id('kw') # locate the search input box
text.send_keys('人民币') # type into the input box
bro.save_screenshot('2.png')
time.sleep(1)
button=bro.find_element_by_id('su')
button.click() # click the search button
bro.save_screenshot('3.png')
time.sleep(3)
bro.save_screenshot('3.png') # screenshot again after the results load (overwrites 3.png)
bro.quit() # close the browser
from selenium import webdriver
import time
bro = webdriver.PhantomJS(executable_path=r'phantomjs-2.1.1-windows\bin\phantomjs.exe')
url = 'https://movie.douban.com/typerank?type_name=%E7%A7%91%E5%B9%BB&type=17&interval_id=100:90&action='
bro.get(url)
time.sleep(1)
# have the browser object execute js code
js = 'window.scrollTo(0,document.body.scrollHeight)' # scroll to the bottom of the page
for i in range(5):
    bro.execute_script(js)
    time.sleep(2)
    bro.save_screenshot('2.png')
# grab the page source after the dynamically loaded data is in place
page_text = bro.page_source
Source: https://www.cnblogs.com/qiuyicheng/p/10753105.html