springboot 搜索引擎项目使用到的技术

2023-01-05 16:51 479人阅读评论(0)

构建索引模块

SSM框架

构建索引模块

SSM框架

首先我们用到了很多 SSM 框架的注解简化了很多重复的工作，同时我们不必关心对象的管理，这些都由 IoC 容器（Spring）完成了。

Spring 中常用注解：

存取对象相关的：


  
   
    
     
    
    
     
      // 存对象用到的注解
     
    
   
    
     
    
    
     
      // 五大类注解
     
    
   
    
     
    
    
     
      // 控制器
     
    
   
    
     
    
    
     
      @Controller
     
    
   
    
     
    
    
     
      // 配置相关的 
     
    
   
    
     
    
    
     
      @Configuration
     
    
   
    
     
    
    
     
      // 组件，经常用到的工具类
     
    
   
    
     
    
    
     
      @Component
     
    
   
    
     
    
    
     
      // 服务层，对数据进行组合转换等处理
     
    
   
    
     
    
    
     
      @Service
     
    
   
    
     
    
    
     
      // 仓库，数据持久层，主要是和数据库有关的类
     
    
   
    
     
    
    
     
      @Repository
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      // 方法注解，对象的类型由方法返回类型决定
     
    
   
    
     
    
    
     
      // 方法注解必须配合五大类注解进行使用
     
    
   
    
     
    
    
     
      @Bean
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      // 取对象用到的注解
     
    
   
    
     
    
    
     
      // 对象注入有三种方法 1.字段注入 2.构造方法注入（官方推荐） 3.setter注入
     
    
   
    
     
    
    
     
      // spring 框架提供的
     
    
   
    
     
    
    
     
      @Autowired
     
    
   
    
     
    
    
     
      // JDK 提供的
     
    
   
    
     
    
    
     
      @Resource
     
    
   
    
     
    
    
     
      // 配合 @Autowired 一起使用的
     
    
   
    
     
    
    
     
      @Qualifier

spring MVC 相关的：

路由相关的：


  
   
    
     
    
    
     
      // Spring Web 应用程序中最常被用到的注解之一，它是用来注册接口的路由映射的。
     
    
   
    
     
    
    
     
      @RequestMapping
     
    
   
    
     
    
    
     
      // 等效于 @RequestMapping(method = RequestMethod.POST)
     
    
   
    
     
    
    
     
      @PostMapping
     
    
   
    
     
    
    
     
      // 等效于 @RequestMapping(method = RequestMethod.GET)
     
    
   
    
     
    
    
     
      @GetMapping

获取前端参数相关的：


  
   
    
     
    
    
     
      // 后端参数映射（重命名） 前端参数和后端参数名不一样时使用
     
    
   
    
     
    
    
     
      // 这个注解默认参数必传，如果非必传 需设置 required = false
     
    
   
    
     
    
    
     
      @RequestParam
     
    
   
    
     
    
    
     
      // 接受 JSON 对象
     
    
   
    
     
    
    
     
      @RequestBody
     
    
   
    
     
    
    
     
      // 获取路径中的参数，配合 @**Mapping 路径中的 {} 占位符一起使用
     
    
   
    
     
    
    
     
      @PathVariable
     
    
   
    
     
    
    
     
      // 上传文件
     
    
   
    
     
    
    
     
      @RequestPart
     
    
   
    
     
    
    
     
      // 获取 cookie
     
    
   
    
     
    
    
     
      @CookieValue
     
    
   
    
     
    
    
     
      // 获取 header
     
    
   
    
     
    
    
     
      @RequestHeader
     
    
   
    
     
    
    
     
      // 获取 session
     
    
   
    
     
    
    
     
      @SessionAttribute

获取非静态页面数据：


  
   
    
     
    
    
     
      // 返回非静态页面数据；如果不加该注解，返回值一般会被当作静态页面的路径（html、js 这类资源）
     
    
   
    
     
    
    
     
      // @ResponseBody 返回的值如果是字符串会转换成 text/html，如果返回的是对象会转换成 application/json 返回给前端。
     
    
   
    
     
    
    
     
      @ResponseBody

组合注解


  
   
    
     
    
    
     
      // 等于 @Controller + @ResponseBody
     
    
   
    
     
    
    
     
      @RestController

日志相关的：


  
   
    
     
    
    
     
      // 获取日志对象
     
    
   
    
     
    
    
     
      // 等价于 Logger log = LoggerFactory.getLogger(类.class);
     
    
   
    
     
    
    
     
      @Slf4j

stream流

由于构建索引时，我们需要用到很多的集合，而 Java 8 中的 Stream 是对集合（Collection）对象功能的增强，它专注于对集合对象进行各种非常便利、高效的聚合操作（aggregate operation），或者大批量数据操作 (bulk data operation)。配合 lambda 表达式，可以使得代码更精简。


  
   
    
     
    
    
     
      /**
       * Runs the index-building pipeline: scans {@code docRootPath} for html files,
       * turns every file into a {@code Document}, saves the forward indexes, saves the
       * inverted indexes, and shuts down {@code executorService}.
       *
       * <p>The shutdown happens in a {@code finally} block so the executor is also
       * stopped when one of the steps throws.
       *
       * @param args command-line arguments; not read by this method
       * @throws Exception whatever any of the steps above throws
       */
      @Override
      public void run(String... args) throws Exception {
          try {
              // Parse a throwaway sentence first; per the literal below this is a warm-up
              // so that the first real segmentation call is not unusually slow.
              ToAnalysis.parse("随便分个什么，进行预热，避免优化的时候计算第一次特别慢的时间");

              log.info("这里的整个程序的逻辑入口");

              // 1. Scan for all html files.
              log.debug("开始扫描目录，找出所有的 html 文件。{}", docRootPath);
              List<File> htmlFileList = fileScanner.scanFile(docRootPath);
              log.debug("扫描目录结束，一共得到 {} 个文件。", htmlFileList.size());

              // 2. For each html file, extract its title, URL and body text and wrap
              //    them in one object (a Document).
              File rootFile = new File(docRootPath);
              List<Document> documentList = htmlFileList
                      .stream()
                      .parallel()
                      .map(file -> new Document(file,urlPrefix,rootFile))
                      .collect(Collectors.toList());
              log.debug("构建文档完毕，一共 {} 篇文档", documentList.size());

              // 3. Save the forward indexes.
              indexManager.saveForwardIndexesConcurrent(documentList);
              log.debug("正排索引保存成功。");

              // 4. Generate and save the inverted indexes.
              indexManager.saveInvertedIndexesConcurrent(documentList);
              log.debug("倒排索引保存成功。");
          } finally {
              // 5. Shut down the thread pool, on failure as well as on success.
              executorService.shutdown();
          }
      }

在这个代码中，我们需要扫描出根目录中（包含所有子文件夹）所有的 html 文件，并且将这些文件转变成我们需要的 Document 对象以便后续进行分词和权重的计算。stream 像是一个管道。stream（）将集合变成一个流，parallel（）表示使用并行流，使用多核cpu时可以显著的提升速度，map（）表示一个映射，将原来集合的 File 类型映射成我们需要的 Document 类型。collect（）就是一个收集器，将管道中的内容收集到我们需要的集合中。

分词模块

使用了第三方的分词库 ansj

添加如下依赖


  
   
    
     
    
    
     		
      <dependency>
     
    
   
    
     
    
    
     			
      <groupId>org.ansj
      </groupId>
     
    
   
    
     
    
    
     			
      <artifactId>ansj_seg
      </artifactId>
     
    
   
    
     
    
    
     			
      <version>5.1.6
      </version>
     
    
   
    
     
    
    
     		
      </dependency>

导包

import org.ansj.splitWord.analysis.ToAnalysis;

使用 ToAnalysis.parse() 进行分词，保存到集合中


  
   
    
     
    
    
             
      // 对正文进行分词，得到一个 contentWordList
     
    
   
    
     
    
    
             
      Result 
      parseResultOfContent 
      = ToAnalysis.parse(content);
     
    
   
    
     
    
    
     
              List<String> contentWordList = parseResultOfContent
     
    
   
    
     
    
    
     
                      .getTerms()
     
    
   
    
     
    
    
     
                      .stream()
     
    
   
    
     
    
    
     
                      .parallel()
     
    
   
    
     
    
    
     
                      .map(Term::getName)
     
    
   
    
     
    
    
     
                      .filter(s -> !ignoredWordSet.contains(s))
     
    
   
    
     
    
    
     
                      .collect(Collectors.toList());

MyBatis

MyBatis 可以通过简单的 XML 或注解来配置和映射原始类型。简单来说 MyBatis 是更简单完成程序和数据库交互的工具，也就是更简单的操作和读取数据库工具。mybatis 是介于 java 和数据库之间的一个框架，帮我们干了 jdbc 的很多事：

1. 创建数据库连接池 DataSource
2. 通过 DataSource 获取数据库连接 Connection
3. 编写要执行带 ? 占位符的 SQL 语句
4. 通过 Connection 及 SQL 创建操作命令对象 Statement
5. 替换占位符：指定要替换的数据库字段类型，占位符索引及要替换的值
6. 使用 Statement 执行 SQL 语句
7. 查询操作：返回结果集 ResultSet，更新操作：返回更新的数量
8. 处理结果集
9. 释放资源

我们只需要关注如何写好 **Mapper.xml、sql语句和接口就好了

在 Mapper.xml 中进行如下配置：


  
   
    
     
    
    
     
      <?xml version="1.0" encoding="UTF-8"?>
     
    
   
    
     
    
    
             
      <!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
     
    
   
    
     
    
    
     
      <mapper namespace="com.yukuanyan.indexer.mapper.IndexMapper">
     
    
   
    
     
    
    
         
      <insert id="batchInsertForwardIndexes" useGeneratedKeys="true" keyProperty="docId" keyColumn="docid">
     
    
   
    
     
    
    
     
              insert into forward_indexes (title, url, content) values
     
    
   
    
     
    
    
             
      <!-- 一共有多少条记录，得根据用户传入的参数来决定，所以这里采用动态 SQL 特性 -->
     
    
   
    
     
    
    
             
      <foreach collection="list" item="doc" separator=", ">
     
    
   
    
     
    
    
     
                  (#{doc.title}, #{doc.url}, #{doc.content})
     
    
   
    
     
    
    
             
      </foreach>
     
    
   
    
     
    
    
         
      </insert>
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
         
      <!-- 不关心自增 id -->
     
    
   
    
     
    
    
         
      <insert id="batchInsertInvertedIndexes">
     
    
   
    
     
    
    
     
              insert into inverted_indexes (word, docid, weight) values
     
    
   
    
     
    
    
             
      <foreach collection="list" item="record" separator=", ">
     
    
   
    
     
    
    
     
                  (#{record.word}, #{record.docId}, #{record.weight})
     
    
   
    
     
    
    
             
      </foreach>
     
    
   
    
     
    
    
         
      </insert>
     
    
   
    
     
    
    
     
      </mapper>

insert 标签对应插入操作；id 对应接口中方法的名字；useGeneratedKeys 表示取回数据库自动生成的主键（例如自增 id），并回填到被插入的对象中。

keyProperty 对应被插入对象中用来接收主键的属性，keyColumn 对应表中的主键列名。

#{} 和 ${} 都是参数占位符。前者预编译处理。后者字符直接替换。

因为 ${} 是字符直接替换，会有 sql注入问题，因此能不用我们尽量不用。如果一定要使用，要在 controller 层对参数进行安全校验

但是 #{} 也不是万能的：它以预编译参数的方式绑定值，字符串参数的效果相当于被加上了单引号，因此如果参数是 SQL 关键字（例如定义排序方向的 asc / desc）就会出错。#{} 应尽量只在被替换内容是数据的时候使用。

IndexMapper 接口


  
   
    
     
    
    
     
      package com.yukuanyan.indexer.mapper;
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      import com.yukuanyan.indexer.
      module.Document;
     
    
   
    
     
    
    
     
      import com.yukuanyan.indexer.
      module.InvertedRecord;
     
    
   
    
     
    
    
     
      import org.apache.ibatis.annotations.Mapper;
     
    
   
    
     
    
    
     
      import org.apache.ibatis.annotations.Param;
     
    
   
    
     
    
    
     
      import org.springframework.stereotype.Repository;
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      import java.util.List;
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
      /**
       * Mapper interface for writing index records in batches.
       */
      @Mapper
      @Repository
      public interface IndexMapper {

          /**
           * Batch-inserts forward index entries.
           *
           * @param documentList the documents to insert; bound under the parameter name {@code "list"}
           */
          void batchInsertForwardIndexes(@Param("list") List<Document> documentList);

          /**
           * Batch-inserts inverted index entries.
           *
           * @param recordsList the inverted records to insert; bound under the parameter name {@code "list"}
           */
          void batchInsertInvertedIndexes(@Param("list") List<InvertedRecord> recordsList);
      }

插入操作：首先在需要的类中注入 IndexMapper 接口的对象，再调用其中的方法就可以进行插入了


  
   
    
     
    
    
                 
      // Task that batch-inserts the slice [from, to) of documentList and then counts
      // the latch down by one.
      Runnable task = () -> {
          try {
              List<Document> subList = documentList.subList(from, to);

              // Insert the whole slice with a single batch statement.
              indexMapper.batchInsertForwardIndexes(subList);
          } finally {
              // Count down even when the insert throws; otherwise the latch would never
              // reach zero (presumably the submitting code waits on it - confirm at the call site).
              latch.countDown();
          }
      };

搜索模块

前端

只有两个页面，一个是搜索的主页，一个是显示搜索结果的页面。搜索的主页是一个静态资源，写在index.html内

首页的设计借鉴了青柠起始页的设计，具体细节如下：甲方你请说：仿青柠搜索页模态搜索栏（HTML+CSS+JS）_哔哩哔哩_bilibili

搜索结果页在主页输入搜索词并点击搜索之后进入，使用了 Thymeleaf 模板技术

后端

搜索结果展示对应后端 controller 下的一个方法，其路由路径设置为 "/web"。

首先对参数进行合法性校验


  
   
    
     
    
    
     
              log.debug(
      "查询: query = {}", query);
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
             
      // 参数的合法性检查 + 处理
     
    
   
    
     
    
    
             
      if (query == 
      null) {
     
    
   
    
     
    
    
     
                  log.debug(
      "query 为 null，重定向到首页");
     
    
   
    
     
    
    
                 
      return 
      "redirect:/";
     
    
   
    
     
    
    
     
              }
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
              query = query.trim().toLowerCase();
     
    
   
    
     
    
    
             
      if (query.isEmpty()) {
     
    
   
    
     
    
    
     
                  log.debug(
      "query 为空字符串，重定向到首页");
     
    
   
    
     
    
    
                 
      return 
      "redirect:/";
     
    
   
    
     
    
    
     
              }

对查询字段进行分词


  
   
    
     
    
    
      
     
    
   
    
     
    
    
     
              List<String> queryList = ToAnalysis.parse(query)
     
    
   
    
     
    
    
     
                      .getTerms()
     
    
   
    
     
    
    
     
                      .stream()
     
    
   
    
     
    
    
     
                      .map(Term::getName)
     
    
   
    
     
    
    
     
                      .collect(Collectors.toList());
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
             
      if (queryList.isEmpty()) {
     
    
   
    
     
    
    
     
                  log.debug(
      "query 分词后一个词都没有，重定向到首页");
     
    
   
    
     
    
    
                 
      return 
      "redirect:/";
     
    
   
    
     
    
    
     
              }
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
     
              log.debug(
      "进行查询的词: {}", query);

对所有的查询词进行查询，将所有结果保存到集合中


  
   
    
     
    
    
     
              List<DocumentWithWeight> totalList = 
      new 
      ArrayList<>();
     
    
   
    
     
    
    
             
      for (String s : queryList) {
     
    
   
    
     
    
    
     
                  List<DocumentWithWeight> documentList = mapper.queryWithWeight(s, limit, offset);
     
    
   
    
     
    
    
     
                  totalList.addAll(documentList);
     
    
   
    
     
    
    
     
              }

由于可能有多个查询词，需要对不同的查询词进行权重聚合


  
   
    
     
    
    
     
              Map<Integer, DocumentWithWeight> documentMap = 
      new 
      HashMap<>();
     
    
   
    
     
    
    
             
      for (DocumentWithWeight documentWithWeight : totalList) {
     
    
   
    
     
    
    
                 
      int 
      docId 
      = documentWithWeight.getDocId();
     
    
   
    
     
    
    
                 
      if (documentMap.containsKey(docId)) {
     
    
   
    
     
    
    
                     
      DocumentWithWeight 
      item 
      = documentMap.get(docId);
     
    
   
    
     
    
    
     
                      item.weight += documentWithWeight.weight;
     
    
   
    
     
    
    
                     
      continue;
     
    
   
    
     
    
    
     
                  }
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
                 
      DocumentWithWeight 
      item 
      = 
      new 
      DocumentWithWeight(documentWithWeight);
     
    
   
    
     
    
    
     
                  documentMap.put(docId, item);
     
    
   
    
     
    
    
     
              }

对聚合后的结果进行排序，由于集合没有排序的概念，我们需要转变为线性结构才能进行排序


  
   
    
     
    
    
     
              Collection<DocumentWithWeight> values = documentMap.values();
     
    
   
    
     
    
    
             
      // Collection 没有排序这个概念（只有线性结构才有排序的概念），所以我们需要一个 List
     
    
   
    
     
    
    
     
              List<DocumentWithWeight> list = 
      new 
      ArrayList<>(values);
     
    
   
    
     
    
    
      
     
    
   
    
     
    
    
             
      // 按照 weight 的从大到小排序了
     
    
   
    
     
    
    
     
              Collections.sort(list, (item1, item2) -> {
     
    
   
    
     
    
    
                 
      return item2.weight - item1.weight;
     
    
   
    
     
    
    
     
              });

将结果交给模板，由模板去渲染


  
   
    
     
    
    
     
              model.addAttribute(
      "query", query);
     
    
   
    
     
    
    
     
              model.addAttribute(
      "docList", documentList);
     
    
   
    
     
    
    
     
              model.addAttribute(
      "page", page);

转载：https://blog.csdn.net/m0_53653818/article/details/128512573

查看评论

飞道的博客

飞道的博客

个人资料

文章分类

文章存档

阅读排行

评论排行

推荐文章

springboot 搜索引擎项目使用到的技术

构建索引模块

SSM框架

stream流

分词模块

MyBatis

搜索模块

前端

后端

* 以上用户言论只代表其个人观点，不代表本网站的观点或立场