Python 爬虫网页抓图保存

时间：2014-11-16 23:06:32 阅读：453 评论：0 收藏：0 [点我收藏+]

抓取的目标选择了桌面壁纸网站（desk.zol.com.cn）的汽车主题：

代码中下面这两个被注释掉的 print 语句，可以在调试时打开：

#print tag
#print attrs

#!/usr/bin/env python
import re
import urllib2
import HTMLParser
# Site root; prepended to the link values taken from matching <a> tags.
base = "http://desk.zol.com.cn"
# Directory prefix for saved images. download() joins it to the file name by
# plain string concatenation, so the trailing '/' is required.
path = '/home/mk/cars/'
# Most recent link found on the index page (assigned in parse.handle_starttag).
# A "next" link equal to this value is not followed.
star = ''
def get_url(html):
	"""Fetch one page and run it through a non-index ``parse`` instance.

	html -- despite its name, the URL of the page to fetch; it is passed
	        straight to ``urllib2.urlopen``.

	Errors raised by ``urllib2.urlopen`` or ``read`` propagate to the caller.
	"""
	response = urllib2.urlopen(html)
	try:
		body = response.read()
	finally:
		# Close before parsing: feed() can call back into get_url() through
		# parse.handle_starttag, so a response left open here would stay
		# open for the whole nested crawl.
		response.close()
	parse(False).feed(body)
def download(url):
	content = urllib2.urlopen(url).read()
	format = '[0-9]*\.jpg';
	res = re.search(format,url);
	print 'downloading:',res.group()
	filename = path+res.group()
	f = open(filename,'w+')
	f.write(content)
	f.close()	 
class parse(HTMLParser.HTMLParser):
	def __init__(self,Index):
		self.Index = Index;
		HTMLParser.HTMLParser.__init__(self)
	def handle_starttag(self,tag,attrs):
		#print tag
		#print attrs
		if(self.Index):
			if not cmp(tag,'a'):
				if(len(attrs) == 4):
					if(attrs[0] ==('class','pic')):
						#print tag
						#print attrs
						new = base+attrs[1][1]
						print 'found a link:',new
						global star
						star = new
						get_url(new)
		else:
			if not cmp(tag,'img'):
				if(attrs[0] == ('id','bigImg')):
					#print tag
					#print attrs
					Image_url = attrs[1][1]
					print 'found a picture:',Image_url
					download(Image_url)
			if not cmp(tag,'a'):
				if (len(attrs) == 4):
					if (attrs[1] == ('class','next')):
						#print tag
						#print attrs
						next = base + attrs[2][1]
						print 'found a link:',next
						if (star != next):
							get_url(next)
# Script entry: fetch the index page and parse it in index mode.
Index_url = 'http://desk.zol.com.cn/qiche/'
con = urllib2.urlopen(Index_url).read()
Parser_index = parse(True)
# feed() invokes parse.handle_starttag for each start tag; in index mode that
# callback calls get_url() for every matching link, so the whole crawl runs
# inside this one call.
Parser_index.feed(con)

bubuko.com,布布扣

这段代码仅仅是抓取桌面壁纸网站上的优美壁纸……

Python 爬虫网页抓图保存

原文：http://blog.csdn.net/wu20093346/article/details/41179367

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年09月23日 (328)
2021年09月24日 (313)
2021年09月17日 (191)
2021年09月15日 (369)
2021年09月16日 (411)
2021年09月13日 (439)
2021年09月11日 (398)
2021年09月12日 (393)
2021年09月10日 (160)
2021年09月08日 (222)