scrapy startproject tutorial

This command creates a tutorial directory with the following contents:
tutorial/
    scrapy.cfg
    tutorial/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            ...

These files are, respectively:

scrapy.cfg: the project's configuration file
tutorial/: the project's Python module; your code goes here
tutorial/items.py: the project's items file
tutorial/pipelines.py: the project's pipelines file
tutorial/settings.py: the project's settings file
tutorial/spiders/: the directory where your spiders live

First, define the fields we want to scrape by editing tutorial/items.py:

import scrapy
class DmozItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()
Next, write the first spider. Save the following as tutorial/spiders/dmoz_spider.py:

import scrapy

class DmozSpider(scrapy.spiders.Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        # Name each saved page after the second-to-last URL segment
        # ("Books" / "Resources") and dump the raw response body into it.
        filename = response.url.split("/")[-2]
        with open(filename, 'wb') as f:
            f.write(response.body)
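To run the spider, change into the project's top-level directory and invoke the crawl command with the spider's name:

scrapy crawl dmoz

Given the parse() method above, this should leave two files, Books and Resources, in the current directory, each containing the raw body of the corresponding start URL.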
Scrapy's interactive shell is a convenient way to try out selectors against a live page. Launch it with one of the start URLs (keep the URL quoted):

scrapy shell "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/"
Inside the shell, the fetched page is available as response. Iterate over the page's list items and extract each entry's title, link, and description:

for sel in response.xpath('//ul/li'):
    title = sel.xpath('a/text()').extract()
    link = sel.xpath('a/@href').extract()
    desc = sel.xpath('text()').extract()
    print title, link, desc
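You can also test one expression at a time in the shell before building the loop; for example:

>>> response.xpath('//title/text()').extract()    # the page <title> as a list of unicode strings
>>> response.xpath('//ul/li/a/@href').extract()   # every link URL inside the list items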
Moving this logic into the spider, dmoz_spider.py becomes:

# -*- coding: UTF-8 -*-
import sys
import scrapy
from scrapy.spiders import Spider
from scrapy.selector import Selector
from tutorial.items import DmozItem

# Set the default encoding (Python 2 only)
reload(sys)
sys.setdefaultencoding('gbk')

class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        sites = sel.xpath('//ul/li')
        for site in sites:
            title = site.xpath('a/text()').extract()
            link = site.xpath('a/@href').extract()
            desc = site.xpath('text()').extract()
            print("title= " + str(title) + "\tlink= " + str(link) + "\tdesc= " + str(desc) + "\n")
Note: Item objects behave like Python dictionaries:

>>> item = DmozItem()
>>> item['title'] = 'Example title'
>>> item['title']
'Example title'
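They also support most of the standard dict API; continuing the session above (a quick sketch, results depend on the fields you have assigned):

>>> item.keys()
['title']
>>> dict(item)
{'title': 'Example title'}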
Generally, a Spider hands the scraped data back to Scrapy as Item objects. So, to return the scraped data, modify the DmozSpider class in dmoz_spider.py:

# -*- coding: UTF-8 -*-
import sys
import scrapy
from scrapy.spiders import Spider
from scrapy.selector import Selector
from tutorial.items import DmozItem

# Set the default encoding (Python 2 only)
reload(sys)
sys.setdefaultencoding('gbk')

class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        sites = sel.xpath('//ul[@class="directory-url"]/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.xpath('a/text()').extract()
            item['link'] = site.xpath('a/@href').extract()
            item['desc'] = site.xpath('text()').re(r'-\s[^\n]*\r')
            items.append(item)
        return items
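Even before adding a pipeline, you can dump the returned items with Scrapy's built-in feed exports, e.g. to JSON:

scrapy crawl dmoz -o items.json

Note that the built-in JSON exporter escapes non-ASCII characters by default, which is exactly what the custom pipeline below avoids by passing ensure_ascii=False.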
To write the items to a JSON file without ASCII-escaping the text, edit tutorial/pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy import signals  # only needed for signal-based wiring (see the sketch below)
import json
import codecs

class TutorialPipeline(object):
    def process_item(self, item, spider):
        return item

class JsonWithEncodingTutorialPipeline(object):
    def __init__(self):
        # One output file for the whole crawl, opened with an explicit encoding.
        self.file = codecs.open('dmoz.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Serialize each item as one JSON object, keeping non-ASCII text readable.
        line = json.dumps(dict(item), ensure_ascii=False) + '\n\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider on pipelines when the spider finishes
        # (the original listing's spider_closed is never invoked automatically).
        self.file.close()
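If you do want to keep the spider_closed name from the original listing, it only runs when connected to the spider_closed signal yourself, which is presumably why signals is imported above. A minimal sketch of that alternative wiring (the class name is hypothetical):

class SignalClosingPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this to build the pipeline; subscribe the method here.
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed,
                                signal=signals.spider_closed)
        return pipeline

    def spider_closed(self, spider):
        pass  # close files / release resources here

    def process_item(self, item, spider):
        return item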
Then open the tutorial/tutorial/settings.py file and append the code below at the end (the ITEM_PIPELINES and LOG_LEVEL lines):

# -*- coding: utf-8 -*-

# Scrapy settings for tutorial project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'tutorial'
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
ITEM_PIPELINES = {
    # The value (0-1000) sets the order in which pipelines run; lower runs first.
    'tutorial.pipelines.JsonWithEncodingTutorialPipeline': 300,
}
LOG_LEVEL = 'INFO'

Run scrapy crawl dmoz once more; this time the pipeline writes every scraped item to dmoz.json in the project directory.

Source: http://blog.csdn.net/fly_yr/article/details/51540269