第一个反爬机制
编码流程:
jupyter快捷键
插入cell:a,b
删除:x或者dd
cell模式切换:m y
执行cell:shift+enter
tab:代码补全
打开帮助文档:shift+tab
爬取搜狗主页案例
# Fetch the Sogou home page and persist the raw HTML to a local file.
# 1. Specify the target URL
url = 'https://www.sogou.com'
# 2. Send the GET request
response = requests.get(url=url)
# 3. Take the response body as a str
page_text = response.text
# 4. Persist to disk. FIX: the write call must sit inside the `with`
# block — as pasted it was dedented, which is an IndentationError.
with open('./sogou.html','w',encoding='utf-8') as fp:
    fp.write(page_text)
动态爬取搜狗搜索主页(简易的网页采集器)
#User-Agent:请求载体的身份标识
#UA检测:门户网站的服务器端会检测每一个请求的UA,如果检测到请求的UA为爬虫程序,则请求失败
#UA伪装:
#简易的网页采集器
# Simple web collector: fetch the Sogou search results page for a
# user-supplied keyword and save the HTML as '<keyword>.html'.
wd = input('enter a word:')
url = 'https://www.sogou.com/web'
# Make the query parameter dynamic; requests encodes it as ?query=<wd>.
# FIX: the dict entry and the fp.write below were dedented in the paste,
# which is a SyntaxError/IndentationError — restored proper indentation.
param = {
    "query": wd,
}
# UA spoofing: present a regular Chrome User-Agent so the server's
# UA check does not reject the request as a crawler program.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
response = requests.get(url=url, params=param, headers=headers)
# Set the response encoding explicitly to avoid garbled Chinese text
response.encoding = 'utf-8'
# .text returns the response body as a str
page_text = response.text
filename = wd + '.html'
with open(filename, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print(filename, '下载成功')
爬取肯德基餐厅位置信息
# Scrape KFC restaurant locations for a city from the official
# store-list AJAX endpoint and append {name: address} dicts to ./kfc.txt.
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
city = input('enter a city name')
# Loop-invariant: hoisted out of the page loop (was rebuilt per page).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.36 Safari/537.36'.replace('537.36 Safari', '537.36 Safari')  # placeholder removed below
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
# FIX: the original re-opened kfc.txt in append mode on every page and
# closed it manually, leaking the handle if a request raised. Open once
# with `with` so the file is closed even on failure.
with open('./kfc.txt', 'a+', encoding='utf-8') as fp:
    for pageIndex in range(1, 9):
        # POST form data; the endpoint pages its results by pageIndex
        data = {
            "cname": "",
            "pid": "",
            "keyword": city,
            "pageIndex": pageIndex,
            "pageSize": "10",
        }
        response = requests.post(url=url, data=data, headers=headers)
        # .json() parses the response body into a Python object (dict)
        page_text = response.json()
        kfc_dic = {}
        for dic in page_text['Table1']:
            kfc_dic[dic['storeName'] + '餐厅'] = dic['addressDetail']
        fp.write(str(kfc_dic))
爬取豆瓣排行榜电影的信息
import requests

# Fetch one page of Douban's movie chart (category type 5, rating band
# "100:90") through its JSON AJAX endpoint and print the parsed payload.
url = 'https://movie.douban.com/j/chart/top_list'
s = 1          # offset of the first movie to return
limit = 100    # number of movies requested per response
# Present a regular Chrome User-Agent to pass the server's UA check
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
# Query-string parameters expected by the chart endpoint
param = {
    "type": "5",
    "interval_id": "100:90",
    "action": "",
    "start": s,
    "limit": limit,
}
# .json() parses the JSON response body into Python objects
page_text = requests.get(url=url, headers=headers, params=param).json()
print(page_text)
动态加载的页面数据
爬取药监局的所有企业信息
import requests

# Crawl all company records from the drug-administration portal's
# paged JSON listing endpoint, then fetch each company's detail record.
url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}

def func_list(f, max_page=330):
    """Fetch every listing page and write each company's detail dict to *f*.

    For pages 1..max_page-1, POST the paged query; for every company ID
    in the page's 'list', POST the detail endpoint and append str(dict)
    to the output file. Failed pages are skipped (best-effort crawl).

    :param f: writable text file object the detail dicts are written to
    :param max_page: one past the last page index to request
        (default 330 preserves the original hard-coded range)
    """
    # Loop-invariant: the detail endpoint never changes, hoist it out.
    detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    for i in range(1, max_page):
        data = {
            "on": "true",
            "page": i,
            "pageSize": "15",
            "productName":" ",
            "conditionType": "1",
            "applyname":" ",
            "applysn":" "
        }
        try:
            page_text = requests.post(url=url, data=data, headers=headers)
            # Skip empty responses (endpoint sometimes returns no body)
            if not page_text.text:
                continue
            for dic in page_text.json()['list']:
                _id = dic['ID']
                detail_text = requests.post(url=detail_url, data={'id': _id}, headers=headers).json()
                print(str(detail_text))
                f.write(str(detail_text))
        # FIX: narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit). Network failures, non-JSON
        # bodies, and missing keys skip the page, matching the
        # original best-effort intent.
        except (requests.RequestException, ValueError, KeyError):
            continue

with open('./particulars.txt','w',encoding='utf-8') as f:
    func_list(f)
原文:https://www.cnblogs.com/Godisgirl/p/11006855.html