1. 环境搭建:安装 Ant。从 http://ant.apache.org/ 下载 apache-ant-1.9.9-bin.zip,解压到指定目录,然后配置环境变量:新增 ANT_HOME=F:\life\rainofsky\apache-ant-1.9.9,并在 Path 中追加 %ANT_HOME%\bin。
<!-- Ivy dependencies for the SQL-backed Gora store and the MySQL JDBC driver.
     NOTE(review): presumably these go inside the <dependencies> section of ivy/ivy.xml; confirm against the Nutch source tree. -->
<dependency org="org.apache.gora" name="gora-sql" rev="0.1.1-incubating" conf="*->default" />
<dependency org="mysql" name="mysql-connector-java" rev="5.1.18" conf="*->default"/>
3. 在 Nutch 根目录打开命令窗口,运行:ant eclipse -verbose
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements. See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License. You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>

  <!-- Agent name the crawler sends with its HTTP requests. -->
  <property>
    <name>http.agent.name</name>
    <value>mySplider</value>
  </property>

  <!-- Languages the crawler announces in the Accept-Language header. -->
  <property>
    <name>http.accept.language</name>
    <value>ja-jp, en-us,en-gb,en,zh-cn,zh-tw;q=0.7,*;q=0.3</value>
    <description>Value of the “Accept-Language” request header field. This allows selecting non-English language as default one to retrieve. It is a useful setting for search engines build for certain national group.</description>
  </property>

  <!-- Fallback character encoding for fetched text. -->
  <property>
    <name>parser.character.encoding.default</name>
    <value>utf-8</value>
    <description>The character encoding to fall back to when no other information is available</description>
  </property>

  <!-- Directory that holds the crawler plugins. -->
  <property>
    <name>plugin.folders</name>
    <value>src/plugin</value>
    <description>Directories where nutch plugins are located. Each element may be a relative or absolute path. If absolute, it is used as is. If relative, it is searched for on the classpath.</description>
  </property>

  <!-- Store crawl data through the Gora SQL store. -->
  <property>
    <name>storage.data.store.class</name>
    <value>org.apache.gora.sql.store.SqlStore</value>
    <description>The Gora DataStore class for storing and retrieving data. Currently the following stores are available: ….</description>
  </property>

  <!-- Batch id used for generated batches. -->
  <property>
    <name>generate.batch.id</name>
    <value>*</value>
  </property>

</configuration>
6.配置 conf/gora.properties
# Gora datastore settings for conf/gora.properties.
# Each setting must sit on its own line: a properties parser treats everything
# after the first '=' on a line as the value of a single key.

# Use the SQL-backed store as the default datastore.
gora.datastore.default=org.apache.gora.sql.store.SqlStore
gora.datastore.autocreateschema=true

# JDBC connection to the local MySQL server, database "nutch".
gora.sqlstore.jdbc.driver=com.mysql.jdbc.Driver
gora.sqlstore.jdbc.url=jdbc:mysql://localhost:3306/nutch?createDatabaseIfNotExist=true&useUnicode=true&characterEncoding=utf8&autoReconnect=true&zeroDateTimeBehavior=convertToNull

# Placeholder credentials: replace with your own MySQL account and password.
gora.sqlstore.jdbc.user=root
gora.sqlstore.jdbc.password=password
-- Table that holds one row per crawled page, keyed by the page id.
-- Dialect: MySQL (InnoDB, utf8 default character set).
CREATE TABLE webpage (
    -- 255, not 256: with the 3-byte utf8 charset a 256-character key needs
    -- 768 bytes, which exceeds the 767-byte InnoDB index key limit of
    -- MySQL 5.5/5.6 default row formats and makes CREATE TABLE fail with
    -- error 1071 ("Specified key was too long"). 255 characters = 765 bytes.
    id varchar(255) NOT NULL,
    headers blob,
    text longtext DEFAULT NULL,
    status int(11) DEFAULT NULL,
    markers blob,
    parseStatus blob,
    modifiedTime bigint(20) DEFAULT NULL,
    prevModifiedTime bigint(20) DEFAULT NULL,
    score float DEFAULT NULL,
    typ varchar(32) CHARACTER SET latin1 DEFAULT NULL,
    batchId varchar(32) CHARACTER SET latin1 DEFAULT NULL,
    baseUrl varchar(256) DEFAULT NULL,
    content longblob,
    title text DEFAULT NULL,
    reprUrl varchar(256) DEFAULT NULL,
    fetchInterval int(11) DEFAULT NULL,
    prevFetchTime bigint(20) DEFAULT NULL,
    inlinks mediumblob,
    prevSignature blob,
    outlinks mediumblob,
    fetchTime bigint(20) DEFAULT NULL,
    retriesSinceFetch int(11) DEFAULT NULL,
    protocolStatus blob,
    signature blob,
    metadata blob,
    PRIMARY KEY (id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;