MapReduce Quick Start: Implementing WordCount


Contents

1. The data to process

2. Create the Maven project pom.xml

3. Write the Map class

4. Write the Reduce class

5. Write the driver class

6. Execution results


1. The data to process


  
hello word
word count
hello MapReduce
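
To run the example locally, these sample lines need to exist on disk under ./data/wordCount/, the directory the driver in section 5 reads from. A minimal sketch for creating the file (the class name PrepareInput and the file name wordcount.txt are arbitrary choices, not from the original post):

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;

public class PrepareInput {
    public static void main(String[] args) throws Exception {
        // Create the input directory the driver expects.
        Path dir = Paths.get("./data/wordCount");
        Files.createDirectories(dir);
        // Write the three sample lines; the file name is arbitrary,
        // because TextInputFormat reads every file in the directory.
        Files.write(dir.resolve("wordcount.txt"),
                Arrays.asList("hello word", "word count", "hello MapReduce"));
    }
}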

2. Create the Maven project pom.xml


  
<repositories>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
</repositories>
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.6.0-mr1-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.6.0-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.6.0-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.6.0-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.testng</groupId>
        <artifactId>testng</artifactId>
        <version>RELEASE</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.0</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.4.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <minimizeJar>true</minimizeJar>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
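
Because the shade plugin is bound to the package phase, running mvn package produces a self-contained (fat) jar under target/ that can be submitted to a cluster with the hadoop jar command; for this walkthrough the job can also be launched directly from the IDE against the local file system.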

3. Write the Map class


  
package com.czxy.wordCount;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Convert the Text value to a String
        String s = value.toString();
        // Split the line on spaces
        String[] split = s.split(" ");
        // Loop over the words and emit one pair per word
        for (String s1 : split) {
            // Output key = the word, value = 1
            context.write(new Text(s1), new LongWritable(1));
        }
    }
}
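
For reference, the LongWritable key passed to map is the byte offset of the line within the input file; TextInputFormat supplies one call per line. For the sample data the map phase therefore emits, before shuffling:

(hello, 1) (word, 1)
(word, 1) (count, 1)
(hello, 1) (MapReduce, 1)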

4. Write the Reduce class


  
package com.czxy.wordCount;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // Accumulator for the number of times this word occurs
        // (a long rather than an int, to match LongWritable)
        long sumCount = 0;
        for (LongWritable value : values) {
            sumCount += value.get();
        }
        // Write out the word and its total count
        context.write(key, new LongWritable(sumCount));
    }
}
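
Between map and reduce, the shuffle phase sorts the map output and groups the values by key, so each reduce call receives one word together with all of its 1s:

hello     -> [1, 1] -> 2
word      -> [1, 1] -> 2
count     -> [1]    -> 1
MapReduce -> [1]    -> 1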

5. Write the driver class


  
package com.czxy.wordCount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountDriver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // Get a job instance
        Job job = Job.getInstance(new Configuration());
        // Set the jar so the job can run from a packaged jar
        job.setJarByClass(WordCountDriver.class);
        // Set the mapper to run
        job.setMapperClass(WordCountMapper.class);
        // Set the map output key type
        job.setMapOutputKeyClass(Text.class);
        // Set the map output value type
        job.setMapOutputValueClass(LongWritable.class);
        // Set the reducer to run
        job.setReducerClass(WordCountReduce.class);
        // Set the reduce output key type
        job.setOutputKeyClass(Text.class);
        // Set the reduce output value type
        job.setOutputValueClass(LongWritable.class);
        // Configure the input
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("./data/wordCount/"));
        // Configure the output
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("./outPut/wordCount/"));
        // Submit the job and wait for it to finish
        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Launch via ToolRunner and propagate the exit code
        int exitCode = ToolRunner.run(new WordCountDriver(), args);
        System.exit(exitCode);
    }
}
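
Two optional tweaks to run() are worth knowing; they are sketched below and are not part of the original driver. Because summing counts is associative and commutative, the reducer can double as a combiner that pre-aggregates map output on the map side; and since the job fails when the output directory already exists, deleting it up front makes repeated runs convenient (this needs an extra import of org.apache.hadoop.fs.FileSystem):

// Optional lines inside run(), before waitForCompletion:

// Pre-aggregate map output locally; safe for word count because
// addition is associative and commutative.
job.setCombinerClass(WordCountReduce.class);

// Remove a leftover output directory so re-runs do not fail with
// "Output directory ... already exists" (use with care).
FileSystem fs = FileSystem.get(job.getConfiguration());
Path outputDir = new Path("./outPut/wordCount/");
if (fs.exists(outputDir)) {
    fs.delete(outputDir, true);
}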

6. Execution results


  
MapReduce 1
count 1
hello 2
word 2
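
With the default single reducer, these lines are read from the file part-r-00000 under ./outPut/wordCount/. TextOutputFormat separates key and value with a tab by default, and the words appear in sorted key order (Text compares raw bytes, which is why the capitalized MapReduce sorts first).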


Reposted from: https://blog.csdn.net/qq_43791724/article/details/104878338