The optimized code is shown below.
The idea: first create 20 directories in a loop, then loop over the downloads and write into those 20 directories, storing at most 50,000 files in each.
# coding=utf-8
import os
import random
import re
import urllib.request

from bs4 import BeautifulSoup

# Candidate category slugs on the site (not used in this version of the script).
ls = ['zhenrenxiu', 'meinv', 'lianglichemo', 'rentiyishu', 'xiaohua', 'lianglichemo']

BASE_DIR = "d:/craw/"
file_list = os.listdir(BASE_DIR)


def validateTitle(title):
    """Replace characters that are illegal in Windows file names with underscores."""
    rstr = r"[\/\\\:\*\?\"\<\>\|]"  # / \ : * ? " < > |
    new_title = re.sub(rstr, "_", title)
    return new_title


def get_file_name():
    """Pick a directory under BASE_DIR that still has room (fewer than 50,000 files)."""
    file = random.sample(file_list, 1)[0]
    path = BASE_DIR + str(file)
    if os.path.isdir(path):
        total_num = len(os.listdir(path))
        if total_num >= 50000:
            # This directory is full, pick another one.
            return get_file_name()
    else:
        os.mkdir(path)
        print("created directory " + path)
    return path + '/'


for j in range(1, 100000):
    url_origin = "http://www.7160.com/meinv/" + str(j)
    try:
        page_obj = urllib.request.urlopen(url_origin)
        page_soup = BeautifulSoup(page_obj, 'lxml')
        # The gallery page shows "共 N 页" ("N pages in total"); extract N.
        total_page_obj = page_soup.find(text=re.compile('共')).string
        match = re.search(r'\d+', total_page_obj)
        total_page = 0 if match is None else int(match.group())
        for i in range(1, total_page + 1):
            # The first page is index.html, the rest are index_2.html, index_3.html, ...
            if i == 1:
                url = url_origin + "/index.html"
            else:
                url = url_origin + "/index_" + str(i) + ".html"
            request = urllib.request.Request(url)
            try:
                res = urllib.request.urlopen(request)
                soup = BeautifulSoup(res, 'lxml')
                title_obj = soup.find(attrs={"class": "picmainer"})
                if title_obj is not None:
                    print(url)
                    title = title_obj.h1.string
                    src = soup.find('img').get("src")  # first <img> on the page is the photo
                    file_name = validateTitle(title) + ".jpg"
                    save_path = get_file_name() + file_name  # pick the directory once, so the log matches the saved path
                    urllib.request.urlretrieve(src, save_path)
                    print(save_path + " saved")
            except Exception as e:
                print("exception: " + str(e))
    except Exception as e:
        print("exception: " + str(e))
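The description at the top mentions pre-creating the 20 directories in a loop, while the script itself creates them lazily inside get_file_name() and picks one at random. A minimal sketch of that pre-creation step, assuming the same d:/craw base directory; the subdirectory names 0 through 19 and the in-order selection are assumptions for illustration, not the posted code:

# Sketch only: pre-create 20 numbered directories, then pick the first one
# that still has room instead of choosing randomly at download time.
# The base path d:/craw and the names 0..19 are assumptions.
import os

BASE_DIR = "d:/craw/"
MAX_FILES_PER_DIR = 50000

# Step 1: create the 20 directories up front.
for n in range(20):
    path = os.path.join(BASE_DIR, str(n))
    if not os.path.isdir(path):
        os.makedirs(path)

# Step 2: when saving, walk the directories in order and use the first
# one that holds fewer than 50,000 files.
def pick_dir():
    for n in range(20):
        path = os.path.join(BASE_DIR, str(n))
        if len(os.listdir(path)) < MAX_FILES_PER_DIR:
            return path
    raise RuntimeError("all 20 directories already hold 50,000 files")

Note that counting the directory with os.listdir() on every save re-reads up to 50,000 entries each time; keeping an in-memory counter per directory would avoid that cost.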