之前运行hadoop的方式是首先编写好程序,然后再将程序打包成jar包,然后上传到服务器中运行。
现在有一种方法:通过本地 IDEA 直接将 jar 包提交到远程集群中运行。
一个简单的例子:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
public class WordCount {

    /**
     * Mapper: splits each input line into words and emits (word, 1).
     * The input key is the byte offset of the line; the value is the line text.
     */
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        // Reused across map() calls to avoid per-record allocation (standard MR idiom).
        private static final IntWritable ONE = new IntWritable(1);
        private static final Text WORD = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split on runs of whitespace rather than a single space, so tabs
            // and consecutive spaces do not produce empty "" tokens that would
            // otherwise be counted as words.
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) { // leading whitespace yields one empty token
                    WORD.set(token);
                    context.write(WORD, ONE);
                }
            }
        }
    }

    /**
     * Reducer: sums the per-word counts and emits (word, total).
     * Also usable as a combiner because integer sum is associative and commutative.
     */
    public static class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        private static final IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /**
     * Configures and submits the WordCount job to a remote YARN cluster
     * directly from the local machine (no manual jar upload needed).
     */
    public static void main(String[] args) throws Exception {
        // Act as the HDFS user the remote cluster expects.
        System.setProperty("HADOOP_USER_NAME", "611");

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.1.237:9000");
        // Required when submitting from Windows to a Linux cluster.
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // Local path of the job jar that will be shipped to the cluster.
        // NOTE(review): "mapred.jar" is the deprecated alias of
        // "mapreduce.job.jar"; it still works on Hadoop 2.x.
        conf.set("mapred.jar", "D:\\MyFile\\实验室项目\\2021大数据项目\\out\\artifacts\\wordCount\\wordCount.jar");
        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        // Run the job on YARN instead of the default local runner.
        conf.set("mapreduce.framework.name", "yarn");
        // Fixes: "could only be replicated to 0 nodes instead of minReplication (=1).
        // There are 1 datanode(s) running and 1 node(s) are excluded in this operation".
        conf.set("yarn.resourcemanager.hostname", "master");
        // Let the client reach datanodes by hostname (needed when the datanodes'
        // internal IPs are not routable from the submitting machine).
        conf.set("dfs.client.use.datanode.hostname", "true");

        Job job = Job.getInstance(conf, "wordCount"); // job name shown in the RM UI
        job.setJarByClass(WordCount.class);

        String inputFile = "hdfs://192.168.1.237:9000/test/wordCount.txt";
        String outFile = "hdfs://192.168.1.237:9000/test/output";

        // MapReduce refuses to start if the output directory already exists,
        // so delete it (recursively) up front.
        FileSystem fs = FileSystem.get(new URI("hdfs://192.168.1.237:9000"), conf);
        Path outPath = new Path(outFile);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        job.setMapperClass(WordCountMapper.class);
        // The sum reducer doubles as a combiner, cutting shuffle traffic
        // without changing the final counts.
        job.setCombinerClass(WordCountReduce.class);
        job.setReducerClass(WordCountReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(inputFile));
        FileOutputFormat.setOutputPath(job, new Path(outFile));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
其中有一些设置关键的地方:
然后这个程序需要导入的包包括基本的 hadoop、yarn 的依赖包。具体哪些是必须的我没有逐一验证,这里把所有可能用到的依赖全部加入了,其中有一些可能不是必须的,大家可以自己尝试精简。
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.7</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.7</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.7.7</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-shuffle</artifactId>
<version>2.7.7</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-yarn-client -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-client</artifactId>
<version>2.7.7</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-yarn-api -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-api</artifactId>
<version>2.7.7</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-yarn-common -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-yarn-common</artifactId>
<version>2.7.7</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.7</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-jobclient -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>2.7.7</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<version>2.7.7</version>
</dependency>
</dependencies>
在使用这个远程提交的时候,有一些yarn-site.xml和mapred-site.xml必须得配置好:
mapred-site.xml中的一些有关配置:
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<!-- 配置的是history的查看地址 -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>master:10020</value>
</property>
<!-- 配置的是history网页的查看地址 -->
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>master:19888</value>
</property>
</configuration>
yarn-site.xml中的一些有关配置:
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- 配置yarn的resourcemanager的主机,这个是必须配置的 -->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>master</value>
</property>
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
<!-- 配置yarn运行过程中产生的记录聚合 -->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<!-- 聚合后的history的查看地址 -->
<property>
<name>yarn.log.server.url</name>
<value>http://master:19888/jobhistory/logs</value>
</property>
如果需要启动 hadoop 的 history 记录服务,必须单独启动:
${HADOOP_HOME}/sbin/mr-jobhistory-daemon.sh start historyserver
可以通过master:8088查看yarn运行的任务和状态
如果想要查看history的,必须开启history server,如上面的命令所示
原文:https://www.cnblogs.com/liulongtao/p/14823405.html