Example 1: scraping a GB2312-encoded page with urllib.
# Example 1: fetch a GB2312-encoded page with urllib and parse it with BeautifulSoup.
import urllib.request
import re
import os

from bs4 import BeautifulSoup

url = 'http://cpc.people.com.cn/n1/2018/0318/c64094-29873799-8.html'  # page is GB2312-encoded

# Spoof a desktop-browser User-Agent: the server may reject some UA strings,
# so swap it for another one if the request starts failing.
# BUG FIX: the header key must be 'User-Agent' (hyphen), not 'User_Agent' —
# urllib sends header names verbatim, so the underscore variant is ignored
# by servers that filter on User-Agent.
head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6'

req = urllib.request.Request(url, headers=head)  # attach the UA header to the request
# BUG FIX: open the Request object (which carries the headers), not the bare
# url string — the original called urlopen(url), silently discarding `req`.
response = urllib.request.urlopen(req)
# Decode the raw bytes as GB2312 (not UTF-8 as the original comment claimed),
# dropping any undecodable byte sequences.
res = response.read().decode('gb2312', 'ignore')
soup = BeautifulSoup(res, 'lxml')
Example 2: scraping a GBK-encoded page with requests.
# Example 2: fetch a GBK-encoded page with requests and parse it with BeautifulSoup.
import requests
import urllib.request
import re
import os

from bs4 import BeautifulSoup

url = 'http://ldzl.people.com.cn/dfzlk/front/personProvince2580.htm'  # Yunnan roster; page is GBK-encoded

# Spoof a desktop-browser User-Agent.
# BUG FIX: header key must be 'User-Agent' (hyphen), not 'User_Agent'.
head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6'

# BUG FIX: the original built a urllib Request with the headers and then never
# used it, calling requests.get(url) with no headers at all. Pass the headers
# to the request that is actually made.
r = requests.get(url, headers=head, timeout=30)
r.encoding = 'GBK'  # override requests' guessed encoding before reading r.text
soup = BeautifulSoup(r.text, 'lxml')
Original article: https://www.cnblogs.com/polipolu/p/12968677.html