# -*- coding: utf-8 -*-
# @Time    : 2019/6/11 9:47
# @Author  : wujf
# @Email   : 1028540310@qq.com
# @File    : 斗罗大陆2.py
# @Software: PyCharm
"""Download comic page images from kuman.com (title id 1003692).

For each of the pages 1..21 under ``/mh-1003692/`` the script fetches the
HTML, finds every ``<li style="margin-top: -3.6px">`` element, takes the
first ``src="..."`` value inside it, and saves that image into ``SAVE_DIR``
under the last path segment of the image URL.

Failures are best-effort: a page that cannot be fetched or an image that
cannot be downloaded is reported with ``print`` and skipped, so one bad
request does not stop the rest of the run.
"""
import re
import urllib.request

import requests
from bs4 import BeautifulSoup

# One listing page per index; range(1, 22) covers indices 1 through 21.
PAGE_URLS = ["http://www.kuman.com/mh-1003692/{}/".format(i) for i in range(1, 22)]

# Browser-like User-Agent sent with every page request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}

# Seconds to wait for a page response before giving up.
REQUEST_TIMEOUT = 5

# Directory the images are written to (Windows path, must already exist).
SAVE_DIR = "E:\\Python\\python_image"

# Inline style that identifies the <li> elements wrapping the page images.
IMAGE_ITEM_STYLE = "margin-top: -3.6px"

# Captures the value of the first src="..." attribute in an element's markup.
IMAGE_SRC_RE = re.compile(r'src="(.*?)"')


def fetch_page(url):
    """Return the decoded HTML text of *url*.

    HTTP error statuses are not raised here; an error page simply yields no
    matching images later on.

    Raises:
        requests.RequestException: on connection errors or timeouts.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Use the encoding detected from the body so the text decodes correctly.
    response.encoding = response.apparent_encoding
    return response.text


def extract_image_urls(html):
    """Return the first ``src`` URL of every image ``<li>`` found in *html*.

    Items without any ``src`` attribute are skipped.
    """
    # NOTE(review): the pasted source showed replace(' ', ' '), a no-op; the
    # first argument was presumably "&nbsp;" before the paste mangled it.
    # Confirm against the original article.
    soup = BeautifulSoup(html.replace("&nbsp;", " "), "html5lib")
    image_urls = []
    for item in soup.find_all("li", style=IMAGE_ITEM_STYLE):
        # `item` is a bs4 Tag, not a string, so it has to be converted with
        # str() before the regex can be applied to its markup.
        match = IMAGE_SRC_RE.search(str(item))
        if match:
            image_urls.append(match.group(1))
    return image_urls


def download_image(image_url):
    """Save *image_url* into ``SAVE_DIR``; print the error if it fails."""
    file_name = image_url.split("/")[-1]
    target_path = "%s\\%s" % (SAVE_DIR, file_name)
    try:
        urllib.request.urlretrieve(image_url, target_path)
    except Exception as exc:
        # Deliberately broad: any failure on a single image (network error,
        # bad URL, unwritable path) is reported and the run carries on.
        print(exc)
    else:
        print("正在下载%s" % image_url)


def main():
    """Fetch every listing page and download the images it references."""
    for page_url in PAGE_URLS:
        try:
            html = fetch_page(page_url)
        except requests.RequestException as exc:
            # A timeout or connection error on one page must not abort the
            # remaining pages.
            print(exc)
            continue
        for image_url in extract_image_urls(html):
            download_image(image_url)


if __name__ == "__main__":
    main()
# TODO: how to crawl the later, paid chapters will be covered in a future update.
# Original article: https://www.cnblogs.com/wujf-myblog/p/11002313.html