1. Data to process

```text
hello word
word count
hello MapReduce
```
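For a local run, the driver in section 5 reads everything under the relative directory `./data/wordCount/`, so this sample text is assumed to be saved in a file there (the file name below is only an illustration; any text file in that directory will be picked up):

```text
./data/wordCount/
└── words.txt        <- hypothetical file name; its contents are the three lines above
```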
2. Create the Maven project's pom.xml

```xml
<repositories>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
</repositories>

<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.6.0-mr1-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.6.0-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.6.0-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.6.0-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.testng</groupId>
        <artifactId>testng</artifactId>
        <version>RELEASE</version>
    </dependency>
</dependencies>

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.0</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.4.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <minimizeJar>true</minimizeJar>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
```
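Because the shade plugin is bound to the package phase, building the project produces a runnable fat jar under `target/`. A minimal sketch of building and submitting the job (the jar file name depends on your artifactId/version and is only an assumption here):

```bash
# build the shaded (fat) jar
mvn clean package

# submit the job; replace the jar name with the one actually produced under target/
hadoop jar target/wordCount-1.0-SNAPSHOT.jar com.czxy.wordCount.WordCountDriver
```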
3. Write the Mapper class

```java
package com.czxy.wordCount;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Convert the Text value to a String
        String s = value.toString();
        // Split the line on spaces
        String[] split = s.split(" ");
        // Loop over the words and emit each one
        for (String s1 : split) {
            // Emit key = word, value = 1
            context.write(new Text(s1), new LongWritable(1));
        }
    }
}
```
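To make the key/value flow concrete, here is a hypothetical standalone snippet (no Hadoop required, not part of the job) that mimics what the map method emits for the first input line; the tab-separated output mirrors the (Text, LongWritable) pairs written to the context:

```java
public class MapStepDemo {
    public static void main(String[] args) {
        String line = "hello word";              // first line of the sample input
        for (String word : line.split(" ")) {
            // mirrors context.write(new Text(word), new LongWritable(1))
            System.out.println(word + "\t" + 1);
        }
    }
}
```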
4. Write the Reducer class

```java
package com.czxy.wordCount;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // Accumulator for how many times this word occurred (long to match LongWritable)
        long sumCount = 0;
        for (LongWritable value : values) {
            sumCount += value.get();
        }
        // Write the final count for this word
        context.write(key, new LongWritable(sumCount));
    }
}
```
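As an illustration of the grouping the framework performs before reduce is called, this hypothetical standalone snippet sums a list of ones the same way the reducer does for the key "hello" from the sample data:

```java
import java.util.Arrays;
import java.util.List;

public class ReduceStepDemo {
    public static void main(String[] args) {
        // the shuffle phase groups ("hello", 1) and ("hello", 1) into ("hello", [1, 1])
        List<Long> values = Arrays.asList(1L, 1L);
        long sumCount = 0;
        for (long v : values) {
            sumCount += v;
        }
        System.out.println("hello\t" + sumCount);   // prints: hello	2
    }
}
```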
5. Write the driver class

```java
package com.czxy.wordCount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // Get the job instance
        Job job = Job.getInstance(new Configuration());
        // Allow the job to be run from a jar
        job.setJarByClass(WordCountDriver.class);
        // Set the mapper to run
        job.setMapperClass(WordCountMapper.class);
        // Set the map output key type
        job.setMapOutputKeyClass(Text.class);
        // Set the map output value type
        job.setMapOutputValueClass(LongWritable.class);
        // Set the reducer to run
        job.setReducerClass(WordCountReduce.class);
        // Set the reduce output key type
        job.setOutputKeyClass(Text.class);
        // Set the reduce output value type
        job.setOutputValueClass(LongWritable.class);
        // Set the input format and input path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("./data/wordCount/"));
        // Set the output format and output path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("./outPut/wordCount/"));
        // Submit the job and wait for it to finish
        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Launch the driver through ToolRunner
        ToolRunner.run(new WordCountDriver(), args);
    }
}
```
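Two common tweaks to this driver, shown as a hedged sketch rather than part of the original article: read the input/output paths from the command line instead of hard-coding them, and register the reducer as a combiner so counts are pre-aggregated on the map side. The argument positions below are an assumption about how the job would be invoked.

```java
// Inside run(), as an alternative to the hard-coded paths:
TextInputFormat.addInputPath(job, new Path(args[0]));     // assumed: first argument = input dir
TextOutputFormat.setOutputPath(job, new Path(args[1]));   // assumed: second argument = output dir

// Optional: reuse the reducer as a combiner to cut shuffle traffic
job.setCombinerClass(WordCountReduce.class);
```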
6. Result of the run

```text
MapReduce	1
count	1
hello	2
word	2
```
Reprinted from: https://blog.csdn.net/qq_43791724/article/details/104878338