Elasticsearch：避免不必要的脚本 - scripting_小言_互联网的博客

Elasticsearch：避免不必要的脚本 - scripting

2020-08-22 17:41 919人阅读 评论(0)

Painless 脚本为我们的搜索带来了很多的方便和灵活性，但是在很多的实践中，我们需要认真地思考这个脚本是不是最有效的。特别是，当我们在 query 中使用脚本时，我们需要特别注意。这是因为在搜索时，脚本需要针对每个文档进行计算，当文档的数量很大时，这个计算量将会非常大，从而影响搜索的效率。

比如，我们创建如下的一个文档：


  
   
    
     
    
    
     
PUT twitter/_doc/1
{
  "user" : "双榆树-张三",
  "message" : "今儿天气不错啊，出去转转去",
  "uid" : 2,
  "age" : 20,
  "city" : "北京",
  "province" : "北京",
  "country" : "中国",
  "address" : "中国北京市海淀区",
  "location" : {
    "lat" : "39.970718",
    "lon" : "116.325747"
  }
}

假如我们想搜索 message 字段的长度大于 10 的所有文档，我们可以通过如下的方法来获得。


  
   
    
     
    
    
     
GET twitter/_search
{
  "query": {
    "script": {
      "script": {
        "lang": "painless",
        "source": "doc['message.keyword'].value.length() > params.length",
        "params": {
          "length": 10
        }
      }
    }
  }
}

在上面我们使用脚本来计算 message 字段的长度，并返回所有 message 字段长度超过 10 的文档。

上面的搜索乍一看，没有任何毛病，但是假设我们的文档数目有很多（比如几万个文档），那么上面的这个脚本在搜索时的计算量将是非常大的，从而会影响搜索的效率。那么我们有什么好的方法来解决这个问题呢？

我们可以把这个计算放到 index 时候，也就是在建立索引的时候。比如我们可以这么做：


  
   
    
     
    
    
     
DELETE twitter

PUT _ingest/pipeline/calculate_length
{
  "description": "Calculate the length of message",
  "processors": [
    {
      "script": {
        "source": """
          ctx.length = ctx.message.length()
        """
      }
    }
  ]
}

在上面，我们先删除之前创建的 twitter 索引，然后创建一个名为 calculate_length 的 pipeline，它包含一个用于计算 message 字段长度的 script processor。在导入一个文档时，我们使用如下的方法：


  
   
    
     
    
    
     
PUT twitter/_doc/1?pipeline=calculate_length
{
  "user": "双榆树-张三",
  "message": "今儿天气不错啊，出去转转去",
  "uid": 2,
  "age": 20,
  "city": "北京",
  "province": "北京",
  "country": "中国",
  "address": "中国北京市海淀区",
  "location": {
    "lat": "39.970718",
    "lon": "116.325747"
  }
}

由于我们使用了 calculate_length 这个 pipeline，那么它将会为我们创建一个新的叫做 length 的字段。我们通过如下的方法来进行查询：

GET twitter/_search

上面显示的结果为：


  
   
    
     
    
    
     
{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "twitter",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 1.0,
        "_source" : {
          "country" : "中国",
          "address" : "中国北京市海淀区",
          "city" : "北京",
          "length" : 13,
          "message" : "今儿天气不错啊，出去转转去",
          "uid" : 2,
          "province" : "北京",
          "location" : {
            "lon" : "116.325747",
            "lat" : "39.970718"
          },
          "user" : "双榆树-张三",
          "age" : 20
        }
      }
    ]
  }
}

我们可以看到一个新增加的 length 字段。上面显示的值为 13 。那么有了这个字段，我们就可以通过如下的方法来进行查询了：


  
   
    
     
    
    
     
GET twitter/_search
{
  "query": {
    "range": {
      "length": {
        "gt": 10
      }
    }
  }
}

在上面，我们使用了新增加的 length 字段来查询我们的文档。这样可以大大地提高搜索的效率。

转载：https://blog.csdn.net/UbuntuTouch/article/details/108061204

查看评论

小言_互联网的博客

小言_互联网的博客

个人资料

文章分类

文章存档

阅读排行

评论排行

推荐文章

Elasticsearch：避免不必要的脚本 - scripting

* 以上用户言论只代表其个人观点，不代表本网站的观点或立场