"""Mirror the files of a GitHub directory page onto the local disk.

Starting from a ``https://github.com/<user>/<repo>/tree/master/<path>`` URL,
``DownloadFile`` fetches the HTML page, downloads every ``/blob/master/``
link found under that path from ``raw.githubusercontent.com`` and then
recurses into every linked sub-directory.  The work is run on ``timer``
threads when the module is executed as a script.

NOTE(review): link discovery relies on plain ``href="..."`` attributes in the
HTML returned by github.com; confirm that against the current page markup.

The code runs unchanged on Python 2.6+ and Python 3.
"""
import os
import re
import socket
import sys
import threading
import time
import urllib

try:  # Python 2
    import urllib2
    from urllib2 import HTTPError, URLError
except ImportError:  # Python 3: the same API lives in urllib.request / urllib.error
    import urllib.request as urllib2
    from urllib.error import HTTPError, URLError

# Address of a local HTTP proxy.  Currently unused: getContent() builds its
# opener with an empty proxy map, i.e. it always connects directly.
server = '127.0.0.1'
port = '8087'

# Default timeout (seconds) applied to every socket created by this process.
timeout = 720
socket.setdefaulttimeout(timeout)

# Strips everything up to and including "/blob/master/" from a site path so
# that the remainder can be used as a directory below the output root.
_BLOB_PREFIX = re.compile(r'\S*/blob/master/')


class timer(threading.Thread):
    """Thread that performs one ``DownloadFile`` crawl.

    The lower-case class name is kept for backward compatibility.
    """

    def __init__(self, num, interval, dir, url):
        """Store the crawl parameters.

        num      -- caller-chosen identifier, kept as ``thread_num``.
        interval -- seconds to sleep after each file request.
        dir      -- output root directory (prefix for all written paths).
        url      -- GitHub directory page to start from.
        """
        threading.Thread.__init__(self)
        self.thread_num = num
        self.interval = interval
        self.url = url
        self.dir = dir
        self.thread_stop = False

    def run(self):
        """Thread body: crawl ``self.url`` into ``self.dir`` once."""
        DownloadFile(self.interval, self.url, self.dir)

    def stop(self):
        """Set ``thread_stop``.  ``run`` does not poll the flag, so a crawl
        that is already in progress is not interrupted."""
        self.thread_stop = True


def _content_length(response):
    """Return the Content-Length announced by *response*, or 0 if the header
    is missing or not a number.  Header lookup is case-insensitive."""
    try:
        return int(response.info().get('Content-Length'))
    except (TypeError, ValueError):
        return 0


def getContent(url, type):
    """Fetch *url* and return its body.

    url  -- address to open (any scheme the urllib opener supports).
    type -- "img" enables the size filter below; any other value disables it.

    Returns the raw body as read from the response, the string ``'EOF'`` when
    *type* is "img" and the announced Content-Length is below 15000 bytes
    (the body is then not read), or ``None`` when the request fails with
    ``HTTPError`` / ``URLError`` (the error is printed, not raised).
    """
    print(">>start connecting:%s" % url)
    # An empty proxy map disables proxy auto-detection from the environment.
    opener = urllib2.build_opener(urllib2.ProxyHandler({}), urllib2.HTTPHandler)
    urllib2.install_opener(opener)
    try:
        response = urllib2.urlopen(url)
        try:
            length = _content_length(response)
            if type == "img" and length < 15000:
                print(" >>>>>>>>%d" % length)
                return 'EOF'
            print(" ++++++++%d" % length)
            return response.read()
        finally:
            # Always release the connection, also when read() fails.
            response.close()
    except HTTPError as e:  # must precede URLError, which is its base class
        print('The server couldn\'t fulfill the request.')
        # Two spaces: same output as the former `print 'Error code: ', e.code`.
        print('Error code:  %s' % (e.code,))
    except URLError as e:
        print('We failed to reach a server.')
        print('Reason:  %s' % (e.reason,))
    return None


def _as_text(data):
    """Decode a Python 3 ``bytes`` body to text; return anything else as is."""
    if isinstance(data, bytes) and not isinstance(data, str):
        return data.decode('utf-8', 'replace')
    return data


def _linked_suffixes(page, prefix):
    """Return what follows *prefix* in every ``href="<prefix>..."`` of *page*.

    *prefix* is matched literally.  Suffixes containing a ``..`` path segment
    are dropped because they are later used to build local file paths.
    """
    pattern = re.compile(r'href="%s([^"\s]+)"' % re.escape(prefix))
    return [s for s in pattern.findall(page) if '..' not in s.split('/')]


def _save(path, payload):
    """Write *payload* to *path*, creating missing parent directories."""
    parent = os.path.dirname(path)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)
    with open(path, 'wb') as handle:
        handle.write(payload)


def DownloadFile(interval, url, dir):
    """Download all files linked from the directory page *url*, recursively.

    interval -- seconds to sleep after each file request.
    url      -- directory page, e.g. https://github.com/u/r/tree/master/src
    dir      -- output root; files are written to
                ``dir + <path after /blob/master/> + <link suffix>``.

    Returns ``None``.  A page or file that cannot be fetched is skipped
    (getContent() has already printed the error).
    """
    page = getContent(url, "html")
    print("...:%s" % url)
    if page is None:
        return
    page = _as_text(page)

    site_path = url.replace('https://github.com', '').strip()

    # Pass 1: files.  The listing links them under /blob/master/, while the
    # raw content is served from raw.githubusercontent.com under /master/.
    blob_path = site_path.replace('/tree/master/', '/blob/master/')
    raw_base = '%s%s' % ('https://raw.githubusercontent.com',
                         blob_path.replace('/blob/master/', '/master/'))
    out_dir = '%s%s' % (dir, _BLOB_PREFIX.sub('/', blob_path))
    for name in _linked_suffixes(page, blob_path):
        payload = getContent('%s%s' % (raw_base, name), "html")
        if payload is not None and payload != 'EOF':
            _save('%s%s' % (out_dir, name), payload)
        time.sleep(interval)

    # Pass 2: sub-directories, linked under the page's own (tree) path.
    for sub_dir in _linked_suffixes(page, site_path):
        DownloadFile(interval, '%s%s' % (url, sub_dir), dir)


# Example raw/blob URLs this crawler deals with:
# https://raw.githubusercontent.com/vogella/vogella/master/de.vogella.rcp.editor.example/src/de/vogella/rcp/editor/example/Application.java
# https://raw.githubusercontent.com/clojure/clojure/master/src/jvm/clojure/lang/Util.java
# https://github.com/vogella/vogella/blob/master/de.vogella.rcp.editor.example/src/de/vogella/rcp/editor/example/Activator.java
url_ = "https://github.com/vogella/vogella/tree/master/de.vogella.rcp.editor.example/src"
n = 1
# Every worker gets thread number 1, a 1 second pause and the "FILE" output root.
thread = [timer(1, 1, 'FILE', url_) for _ in range(n)]

if __name__ == "__main__":
    # Start crawling only when run as a script, not when imported.
    for worker in thread:
        worker.start()
# GitHub resource file download (Python crawler) -- via Bubuko, bubuko.com
# Original article: http://blog.csdn.net/kevinkitty_love/article/details/23249603