晚上写了一个简易的代理池:先从一个代理网站上爬取代理 IP 和端口,再测试这些代理是否可用。接下来可能会考虑把它扩展成一个规模更大的、用 Go 实现的代理池。
简易版代码:
1 package main 2 3 import ( 4 "os" 5 "fmt" 6 log "github.com/Sirupsen/logrus" 7 "io/ioutil" 8 "strings" 9 ) 10 11 type New struct { 12 Prefix string 13 NewId string 14 Title string 15 Time string 16 Content string 17 Subject string 18 } 19 20 type Subject struct { 21 Name string 22 Url string 23 } 24 25 func CreateDir(PathName string) error { 26 err := os.Mkdir(PathName, 0777) 27 if err != nil && !os.IsExist(err) { 28 return err 29 } 30 return nil 31 } 32 33 func AppendFile(SavePath string, FileName string, buf string) { 34 out, err := os.OpenFile(SavePath+FileName, os.O_WRONLY, 0644) 35 defer out.Close() 36 if err != nil { 37 log.Errorln(err.Error()) 38 return 39 } 40 offset, err := out.Seek(0, os.SEEK_END) 41 if err != nil { 42 log.Errorln(err.Error()) 43 return 44 } 45 _, err = out.WriteAt([]byte(buf), offset) 46 if err != nil { 47 log.Errorln(err.Error()) 48 return 49 } 50 log.Warnln("Save file finished. Locate in ", SavePath + FileName) 51 } 52 53 func PathExists(path string) bool { 54 _, err := os.Stat(path) 55 if err == nil { 56 return true 57 } 58 if os.IsNotExist(err) { 59 return false 60 } 61 return false 62 } 63 64 func SaveFile(SavePath string, FileName string, buf string) { 65 out, err := os.Create(SavePath + FileName) 66 defer out.Close() 67 fmt.Fprintf(out, "%s", buf) 68 if err != nil { 69 log.Errorln(err.Error()) 70 return 71 } 72 log.Warnln("Save file finished. Locate in ", SavePath + FileName) 73 } 74 75 func ReadAll(path string) ([]byte, error) { 76 f, err := os.Open(path) 77 if err != nil { 78 return nil, err 79 } 80 defer f.Close() 81 return ioutil.ReadAll(f) 82 } 83 84 func ReadFile(path string) []string { 85 var fp interface{} 86 fp, err := ReadAll(path) 87 if err != nil { 88 log.Errorln(err.Error()) 89 return nil 90 } 91 fp = string(fp.([]byte)) 92 return strings.Split(fp.(string), "\n") 93 }
1 package main 2 3 import ( 4 log "github.com/Sirupsen/logrus" 5 "math/rand" 6 "net/http" 7 "net/url" 8 "time" 9 ) 10 11 var userAgent = [...]string { 12 "Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)", 13 "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)", 14 "Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)", 15 "Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0,", 16 "Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11", 17 "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)", 18 "Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 19 "Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 20 "Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 21 "Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 22 "Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 23 "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)", 24 "Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 25 "MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 26 } 27 28 func GetRandomUserAgent() string { 29 var r = rand.New(rand.NewSource(time.Now().UnixNano())) 30 return userAgent[r.Intn(len(userAgent))] 31 } 32 33 func GetFakeHeader(request *http.Request) { 34 request.Header.Set("User-Agent", GetRandomUserAgent()) 35 request.Header.Set("Accept", 
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") 36 request.Header.Set("Connection", "keep-alive") 37 //request.Header.Set("Accept-Encoding", "gzip, deflate") 38 } 39 40 func GetByProxy(proxyAddr string, Url string) (*http.Response, error) { 41 timeout := time.Duration(10 * time.Second) 42 request, err := http.NewRequest(http.MethodGet, Url, nil) 43 GetFakeHeader(request) 44 if err != nil { 45 return nil, err 46 } 47 proxy, err := url.Parse(proxyAddr) 48 if err != nil { 49 log.Errorln(err.Error()) 50 } 51 client := &http.Client{ 52 Transport: &http.Transport{ 53 Proxy: http.ProxyURL(proxy), 54 }, 55 Timeout: timeout, 56 } 57 return client.Do(request) 58 } 59 60 func GetByDirectory(Url string) (*http.Response, error) { 61 timeout := time.Duration(10 * time.Second) 62 request, err := http.NewRequest(http.MethodGet, Url, nil) 63 GetFakeHeader(request) 64 if err != nil { 65 return nil, err 66 } 67 client := http.Client{ 68 Timeout: timeout, 69 } 70 return client.Do(request) 71 }
1 package main 2 3 import ( 4 log "github.com/Sirupsen/logrus" 5 "strconv" 6 "regexp" 7 "github.com/opesun/goquery" 8 "time" 9 ) 10 11 const ( 12 SAVE_PATH = "kproxy.orz" 13 PROXY_URL = "http://www.kuaidaili.com/free/inha/" 14 ) 15 var ( 16 IP_REGEXP = regexp.MustCompile(`[\d]+\.[\d]+\.[\d]+\.[\d]+\n\s+[\d]+`) 17 IP_DETAIL_REGEXP = regexp.MustCompile(`[\d]+\.[\d]+\.[\d]+\.[\d]+`) 18 INT_REGEXP = regexp.MustCompile(`\s[\d]+`) 19 ) 20 21 func UrlGetter(num int) string { 22 return PROXY_URL + strconv.Itoa(num) 23 } 24 25 func GetProxy(Url string) { 26 nod, err := goquery.ParseUrl(Url) 27 if err != nil { 28 log.Errorln(err.Error()) 29 return 30 } 31 ret := nod.Text() 32 ips := IP_REGEXP.FindAll([]byte(ret), -1) 33 var port []string = make([]string, len(ips)) 34 var str string = "" 35 for i := 0; i < len(ips); i++ { 36 port[i] = string(INT_REGEXP.FindAll(ips[i], -1)[0])[1:] 37 ips[i] = IP_DETAIL_REGEXP.FindAll(ips[i], -1)[0] 38 str += string(ips[i])+":"+port[i]+"\n" 39 } 40 AppendFile("./", SAVE_PATH, str) 41 } 42 43 func main() { 44 log.Infoln("Start getting proxy ...") 45 SaveFile("./", SAVE_PATH, "") 46 for i := 1; i <= 500; i++ { 47 log.Println(UrlGetter(i)) 48 GetProxy(UrlGetter(i)) 49 time.Sleep(time.Second*5) 50 } 51 }
下面用一个 Python 脚本来测试抓取到的代理是否可用:
# Probe every proxy listed in PROXY_PATH by fetching AIM_URL through it.
# Written to run unchanged on Python 2 and Python 3.
import os
import socket

try:  # Python 3
    from urllib.request import ProxyHandler, build_opener
except ImportError:  # Python 2
    from urllib2 import ProxyHandler, build_opener

# URL fetched through each proxy. It is empty here; set it before running,
# otherwise every probe fails with an "unknown url type" error.
AIM_URL = ''
# Text file holding one "ip:port" entry per line.
PROXY_PATH = './kproxy.orz'


class MyException(Exception):
    pass


def read_file(path):
    """Return the non-blank lines of ``path`` with surrounding whitespace removed.

    A missing file is reported on stdout and yields an empty list. An empty
    file also yields an empty list.
    """
    if not os.path.exists(path):
        print('path : \'' + path + '\' not find.')
        return []
    # ``with`` already closes the file, even when reading raises.
    with open(path, 'r') as fp:
        content = fp.read()
    return [line.strip() for line in content.splitlines() if line.strip()]


def main():
    """Try AIM_URL through every proxy in PROXY_PATH and print each outcome."""
    socket.setdefaulttimeout(5)
    proxies = read_file(PROXY_PATH)
    print(len(proxies))
    for pp in proxies:
        proxy_url = 'http://' + pp
        print(proxy_url)
        # Only plain-http requests are routed through the proxy.
        opener = build_opener(ProxyHandler({'http': proxy_url}))
        try:
            response = opener.open(AIM_URL)
        except Exception as exc:
            # A dead or slow proxy is an expected result, not a fatal error.
            print('  failed: %s' % exc)
            continue
        response.close()
        print('  ok')


if __name__ == '__main__':
    main()
原文:http://www.cnblogs.com/kirai/p/6193343.html