# -*- coding: utf-8 -*-
# @Time : 2019/6/11 9:47
# @Author : wujf
# @Email : 1028540310@qq.com
# @File : 斗罗大陆2.py
# @Software: PyCharm
import re
import requests
import urllib.request
from bs4 import BeautifulSoup
# Chapter pages 1..21 of the comic; the chapter number is the last URL segment.
urls = ['http://www.kuman.com/mh-1003692/{}/'.format(i) for i in range(1, 22)]

# Browser-like User-Agent sent with every page request (built once, not per page).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

# Prefix of every saved file; urlretrieve does not create this directory,
# so it has to exist before the script runs.
IMAGE_DIR = 'E:\\Python\\python_image\\'

# Captures the value of a double-quoted src attribute (non-greedy).
_SRC_PATTERN = re.compile(r'src="(.*?)"')


def first_image_url(markup):
    """Return the first ``src="..."`` value in *markup*, or None if there is none.

    *markup* may be a string or a BeautifulSoup tag; it is converted with
    ``str()`` first because the regex needs plain text.
    """
    match = _SRC_PATTERN.search(str(markup))
    return match.group(1) if match else None


def local_image_path(image_url):
    """Return the destination path for *image_url*.

    The file name is everything after the last ``/`` of the URL, appended to
    IMAGE_DIR.
    """
    return IMAGE_DIR + image_url.split('/')[-1]


def download_page_images(url):
    """Fetch one chapter page and save the image of every matching ``<li>``.

    Failures are printed and skipped rather than raised, so one bad page or
    image does not stop the rest of the run.
    """
    try:
        r = requests.get(url, headers=headers, timeout=5)
    except requests.RequestException as e:
        # Same best-effort policy as the image downloads below.
        print(e)
        return
    # HTTP error statuses are deliberately not treated as failures: the
    # original script left raise_for_status() disabled.

    # Decode the body with the charset detected from its content.
    r.encoding = r.apparent_encoding
    # NOTE(review): the published listing showed a whitespace-for-whitespace
    # replace here, presumably an HTML-decoded '&nbsp;' -- confirm. Both the
    # entity and the literal no-break space are normalised to a plain space.
    content = r.text.replace('&nbsp;', ' ').replace('\xa0', ' ')
    soup = BeautifulSoup(content, 'html5lib')

    for li in soup.find_all('li', style="margin-top: -3.6px"):
        image_url = first_image_url(li)
        if image_url is None:
            # An <li> without a src attribute used to raise IndexError.
            continue
        try:
            urllib.request.urlretrieve(image_url, local_image_path(image_url))
        except Exception as e:
            # Broad on purpose: any single failed download is reported and
            # the loop moves on to the next image.
            print(e)
        else:
            print("正在下载%s" % image_url)


def main():
    """Download the images of every chapter page listed in ``urls``."""
    for url in urls:
        download_page_images(url)


if __name__ == '__main__':
    main()
# TODO: how to scrape the later, paid chapters will be covered in a future update.
# Original post: https://www.cnblogs.com/wujf-myblog/p/11002313.html