# coding=utf-8
"""Download every ``.jpg`` image referenced by a list of web pages.

Each page in ``config_url_paths`` is fetched, scanned for ``src="....jpg"``
attributes, and every image found is downloaded on its own thread into
``config_save_path``.
"""
import contextlib
import os
import random
import re
import threading
import time

try:
    import urllib2 as request  # Python 2
except ImportError:
    import urllib.request as request  # Python 3

####
config_url_paths = [
    r'''http://image.baidu.com/''',
]
# NOTE: raw string, so every "\\" below is two literal backslashes.  Windows
# collapses repeated separators, so the path still resolves there.
config_save_path = r'''D:\\video\\t\\web\\image_cool\\52\\'''

# [^"] instead of "." keeps a match inside a single src="..." attribute;
# with "." a non-jpg src followed later by a jpg src was captured as one URL.
re_fliter_jpg_full_path = re.compile(r'src="([^"]+?\.jpg)"')
re_filter_jpg_name = re.compile(r'/([^/]+\.jpg)')

# Number of jpg_downloader instances created since the last downloads() call.
# Defined at module level so the class can also be used on its own.
cnt_threads = 0
# Guards updates to cnt_threads.
mutex = threading.Lock()


class jpg_downloader(threading.Thread):
    """Thread that downloads one URL and writes the bytes to disk.

    Args:
        url: Address of the image to fetch.
        filename: File name appended to ``config_save_path`` to form the
            destination path.
    """

    def __init__(self, url, filename):
        global cnt_threads
        threading.Thread.__init__(self)
        with mutex:
            cnt_threads = cnt_threads + 1
            self._id = cnt_threads
        savepath = config_save_path + filename
        self._url = url
        self._savepath = savepath
        print('cnt:' + str(self._id) + ' url:' + url + ' path:' + savepath + '\r\n')

    def run(self):
        """Fetch the image and save it; any error ends only this thread."""
        # closing() releases the connection even if read() raises, and works
        # on Python 2 where the response object is not a context manager.
        with contextlib.closing(request.urlopen(self._url)) as response:
            jpg = response.read()
        print(str(self._id) + 'download finish \r\n')
        with open(self._savepath, 'wb') as out_file:
            out_file.write(jpg)
        print(str(self._id) + 'thread_end \r\n')


def get_html(url):
    """Return the raw body of ``url`` exactly as read from the response."""
    with contextlib.closing(request.urlopen(url)) as page:
        return page.read()


def getImg(html):
    """Return the list of ``.jpg`` URLs found in ``src="..."`` attributes.

    Args:
        html: Page content, either text or the bytes returned by
            ``get_html``.

    Returns:
        A list of the captured URL strings, in document order.
    """
    # On Python 3 the response body is bytes and cannot be searched with a
    # str pattern.  On Python 2 ``bytes is str`` so this branch is skipped.
    # NOTE(review): assumes the URLs survive a UTF-8 decode; undecodable
    # bytes elsewhere in the page are replaced rather than raising.
    if isinstance(html, bytes) and not isinstance(html, str):
        html = html.decode('utf-8', 'replace')
    return re_fliter_jpg_full_path.findall(html)


def _numbered_filename(index, url):
    """Build the ``NNN-name.jpg`` file name used to save ``url``."""
    match = re_filter_jpg_name.search(url)
    if match:
        name = match.group(1)
    else:
        # No "/something.jpg" segment: fall back to the last path segment
        # instead of crashing the whole batch on ``None.group``.
        name = url.rsplit('/', 1)[-1] or 'image.jpg'
    return '%03d' % index + "-" + name


def downloads(urls):
    """Download every URL in ``urls`` concurrently and wait for completion.

    One ``jpg_downloader`` thread is created per URL; files are prefixed
    with the zero-padded position of the URL in ``urls``.

    Args:
        urls: Iterable of image URLs.

    Raises:
        OSError: If ``config_save_path`` does not exist and cannot be
            created.
    """
    global cnt_threads
    with mutex:
        cnt_threads = 0
    threads = []
    for cnt, url in enumerate(urls):
        threads.append(jpg_downloader(url, _numbered_filename(cnt, url)))
    # Create the target directory once, before any worker tries to write.
    if threads and not os.path.isdir(config_save_path):
        os.makedirs(config_save_path)
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('join')
    return


def main():
    """Collect image URLs from every configured page and download them."""
    print('hello ready to start')
    img_list = []
    for url in config_url_paths:
        html = get_html(url)
        img_list.extend(getImg(html))
    print(len(img_list))
    downloads(img_list)
    print("finish")


if __name__ == '__main__':
    main()
# Original article: http://my.oschina.net/mummy108/blog/529508