Elasticsearch：使用 runtime fields 探索你的数据

2022-12-26 20:51 879人阅读评论(0)

假设你有大量日志数据，并希望从中提取字段。为全部数据建立索引非常耗时，并且会占用大量磁盘空间，而你只是想探索数据结构，不想预先确定 schema。

你知道你的日志数据包含你要提取的特定字段。在这种情况下，我们要关注 @timestamp 和 message 字段。通过使用运行时字段（runtime fields），你可以定义脚本，在搜索时计算这些字段的值。

定义索引字段作为起点

你可以从一个简单的示例开始，将 @timestamp 和 message 字段作为索引字段添加到 my-index-000001 映射中。为了保持灵活性，使用 wildcard 作为 message 的字段类型：


  
   
    
     
    
    
     
      PUT /
      my-
      index-
      000001/
     
    
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "mappings": {
     
    
   
    
     
    
    
         
      "properties": {
     
    
   
    
     
    
    
           
      "@timestamp": {
     
    
   
    
     
    
    
             
      "format": 
      "strict_date_optional_time||epoch_second",
     
    
   
    
     
    
    
             
      "type": 
      "date"
     
    
   
    
     
    
    
     
            },
     
    
   
    
     
    
    
           
      "message": {
     
    
   
    
     
    
    
             
      "type": 
      "wildcard"
     
    
   
    
     
    
    
     
            }
     
    
   
    
     
    
    
     
          }
     
    
   
    
     
    
    
     
        }
     
    
   
    
     
    
    
     
      }

在上面，我们有意使用 wildcard 字段来定义 message。这样它非常节省存储空间，并且会提高写入文档的速度。

摄取一些数据

映射完要检索的字段后，将日志数据中的几条记录索引到 Elasticsearch 中。以下请求使用 _bulk API 将原始日志数据索引到 my-index-000001。你可以使用一个小样本来试验运行时字段，而不是索引所有日志数据。

最后一条文档不是有效的 Apache 日志格式，但我们可以在脚本中处理这种情况。


  
   
    
     
    
    
     
      POST 
      /my
      -index
      -
      000001
      /_bulk
      ?refresh
     
    
   
    
     
    
    
     
      {
      "index":{}}
     
    
   
    
     
    
    
     
      {
      "timestamp":
      "2020-04-30T14:30:17-05:00",
      "message":
      "40.135.0.0 - - [30/Apr/2020:14:30:17 -0500] \"GET /images/hm_bg.jpg HTTP/1.0\" 200 24736"}
     
    
   
    
     
    
    
     
      {
      "index":{}}
     
    
   
    
     
    
    
     
      {
      "timestamp":
      "2020-04-30T14:30:53-05:00",
      "message":
      "232.0.0.0 - - [30/Apr/2020:14:30:53 -0500] \"GET /images/hm_bg.jpg HTTP/1.0\" 200 24736"}
     
    
   
    
     
    
    
     
      {
      "index":{}}
     
    
   
    
     
    
    
     
      {
      "timestamp":
      "2020-04-30T14:31:12-05:00",
      "message":
      "26.1.0.0 - - [30/Apr/2020:14:31:12 -0500] \"GET /images/hm_bg.jpg HTTP/1.0\" 200 24736"}
     
    
   
    
     
    
    
     
      {
      "index":{}}
     
    
   
    
     
    
    
     
      {
      "timestamp":
      "2020-04-30T14:31:19-05:00",
      "message":
      "247.37.0.0 - - [30/Apr/2020:14:31:19 -0500] \"GET /french/splash_inet.html HTTP/1.0\" 200 3781"}
     
    
   
    
     
    
    
     
      {
      "index":{}}
     
    
   
    
     
    
    
     
      {
      "timestamp":
      "2020-04-30T14:31:22-05:00",
      "message":
      "247.37.0.0 - - [30/Apr/2020:14:31:22 -0500] \"GET /images/hm_nbg.jpg HTTP/1.0\" 304 0"}
     
    
   
    
     
    
    
     
      {
      "index":{}}
     
    
   
    
     
    
    
     
      {
      "timestamp":
      "2020-04-30T14:31:27-05:00",
      "message":
      "252.0.0.0 - - [30/Apr/2020:14:31:27 -0500] \"GET /images/hm_bg.jpg HTTP/1.0\" 200 24736"}
     
    
   
    
     
    
    
     
      {
      "index":{}}
     
    
   
    
     
    
    
     
      {
      "timestamp":
      "2020-04-30T14:31:28-05:00",
      "message":
      "not a valid apache log"}

此时，你可以查看 Elasticsearch 如何存储你的原始数据。

GET my-index-000001

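该映射包含我们显式定义的两个字段：@timestamp 和 message。此外，由于上面写入的文档使用的字段名是 timestamp（而不是 @timestamp），Elasticsearch 通过动态映射又自动添加了一个 date 类型的 timestamp 字段。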


  
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "my-index-000001": {
     
    
   
    
     
    
    
         
      "aliases": {},
     
    
   
    
     
    
    
         
      "mappings": {
     
    
   
    
     
    
    
           
      "properties": {
     
    
   
    
     
    
    
             
      "@timestamp": {
     
    
   
    
     
    
    
               
      "type": 
      "date",
     
    
   
    
     
    
    
               
      "format": 
      "strict_date_optional_time||epoch_second"
     
    
   
    
     
    
    
     
              },
     
    
   
    
     
    
    
             
      "message": {
     
    
   
    
     
    
    
               
      "type": 
      "wildcard"
     
    
   
    
     
    
    
     
              },
     
    
   
    
     
    
    
             
      "timestamp": {
     
    
   
    
     
    
    
               
      "type": 
      "date"
     
    
   
    
     
    
    
     
              }
     
    
   
    
     
    
    
     
            }
     
    
   
    
     
    
    
     
          },
     
    
   
    
     
    
    
         
      "settings": {
     
    
   
    
     
    
    
           
      "index": {
     
    
   
    
     
    
    
             
      "routing": {
     
    
   
    
     
    
    
               
      "allocation": {
     
    
   
    
     
    
    
                 
      "include": {
     
    
   
    
     
    
    
                   
      "_tier_preference": 
      "data_content"
     
    
   
    
     
    
    
     
                  }
     
    
   
    
     
    
    
     
                }
     
    
   
    
     
    
    
     
              },
     
    
   
    
     
    
    
             
      "number_of_shards": 
      "1",
     
    
   
    
     
    
    
             
      "provided_name": 
      "my-index-000001",
     
    
   
    
     
    
    
             
      "creation_date": 
      "1672032735783",
     
    
   
    
     
    
    
             
      "number_of_replicas": 
      "1",
     
    
   
    
     
    
    
             
      "uuid": 
      "X1cBJOl3TFKd6v0oeTRlng",
     
    
   
    
     
    
    
             
      "version": {
     
    
   
    
     
    
    
               
      "created": 
      "8050399"
     
    
   
    
     
    
    
     
              }
     
    
   
    
     
    
    
     
            }
     
    
   
    
     
    
    
     
          }
     
    
   
    
     
    
    
     
        }
     
    
   
    
     
    
    
     
      }

使用 grok 模式定义运行时字段

如果要检索包含 clientip 的结果，可以将该字段添加为映射中的运行时字段。以下运行时脚本定义了一个 grok 模式，该模式从文档中的单个文本字段中提取结构化字段。 grok 模式就像支持可以重用的别名表达式的正则表达式。

该脚本匹配 %{COMMONAPACHELOG} 日志模式，该模式了解 Apache 日志的结构。如果模式匹配 (clientip != null)，脚本将发出匹配到的 IP 地址。如果模式不匹配，脚本不会发出任何值，也不会崩溃。


  
   
    
     
    
    
     
      PUT my-index-000001/_mappings
     
    
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "runtime": {
     
    
   
    
     
    
    
         
      "http.client_ip": {
     
    
   
    
     
    
    
           
      "type": 
      "ip",
     
    
   
    
     
    
    
           
      "script": 
      """
     
    
   
    
     
    
    
     
       String clientip=grok('%{COMMONAPACHELOG}').extract(doc["message"].value)?.clientip;
     
    
   
    
     
    
    
     
       if (clientip != null) emit(clientip); 
     
    
   
    
     
    
    
     
       """
     
    
   
    
     
    
    
     
          }
     
    
   
    
     
    
    
     
        }
     
    
   
    
     
    
    
     
      }

我们可以为已经存在的索引动态地添加一个新的字段。特别值得指出的是上面的 ?. 操作符（null safe 操作符），可以参阅 Painless 文档进一步了解。它的意思是：对一个 null 对象使用 ?. 操作符会返回 null，而不会使脚本崩溃。再配合上面的 if 检查，即使 message 与模式不匹配，脚本也不会崩溃。

这样，我们可以针对索引进行搜索，比如：


  
   
    
     
    
    
     
      GET 
      my-
      index-
      000001/_search?filter_path=**.hits
     
    
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "query": {
     
    
   
    
     
    
    
         
      "match": {
     
    
   
    
     
    
    
           
      "http.client_ip": 
      "40.135.0.0"
     
    
   
    
     
    
    
     
          }
     
    
   
    
     
    
    
     
        }
     
    
   
    
     
    
    
     
      }

上面的 runtime 字段 http.client_ip 在查询时动态生成，并使得我们可以对它进行搜索：


  
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "hits": {
     
    
   
    
     
    
    
         
      "hits": [
     
    
   
    
     
    
    
     
            {
     
    
   
    
     
    
    
             
      "_index": 
      "my-index-000001",
     
    
   
    
     
    
    
             
      "_id": 
      "Zn7rTIUBIjh__4nuBm2T",
     
    
   
    
     
    
    
             
      "_score": 
      1,
     
    
   
    
     
    
    
             
      "_source": {
     
    
   
    
     
    
    
               
      "timestamp": 
      "2020-04-30T14:30:17-05:00",
     
    
   
    
     
    
    
               
      "message": 
      """40.135.0.0 - - [30/Apr/2020:14:30:17 -0500] "GET /images/hm_bg.jpg HTTP/1.0" 200 24736"""
     
    
   
    
     
    
    
     
              }
     
    
   
    
     
    
    
     
            }
     
    
   
    
     
    
    
     
          ]
     
    
   
    
     
    
    
     
        }
     
    
   
    
     
    
    
     
      }

在上面，我们在 mapping 中定义 runtime fields。在实际的使用中，我们也可以在搜索请求的上下文中定义相同的运行时字段。类型和脚本与之前在索引映射中定义的完全相同（下面的例子把字段命名为 http.clientip）。只需将该定义复制到搜索请求的 runtime_mappings 部分，并包含针对该运行时字段的查询。此查询返回的文档与之前针对索引映射中的 http.client_ip 运行时字段进行搜索时返回的文档相同，但该字段仅在此次搜索的上下文中有效：


  
   
    
     
    
    
     
      GET 
      my-
      index-
      000001/_search?filter_path=**.hits
     
    
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "runtime_mappings": {
     
    
   
    
     
    
    
         
      "http.clientip": {
     
    
   
    
     
    
    
           
      "type": 
      "ip",
     
    
   
    
     
    
    
           
      "script": 
      """
     
    
   
    
     
    
    
     
       String clientip=grok('%{COMMONAPACHELOG}').extract(doc["message"].value)?.clientip;
     
    
   
    
     
    
    
     
       if (clientip != null) emit(clientip);
     
    
   
    
     
    
    
     
       """
     
    
   
    
     
    
    
     
          }
     
    
   
    
     
    
    
     
        },
     
    
   
    
     
    
    
       
      "query": {
     
    
   
    
     
    
    
         
      "match": {
     
    
   
    
     
    
    
           
      "http.clientip": 
      "40.135.0.0"
     
    
   
    
     
    
    
     
          }
     
    
   
    
     
    
    
     
        },
     
    
   
    
     
    
    
       
      "fields" : [
      "http.clientip"]
     
    
   
    
     
    
    
     
      }

上面的搜索返回结果：


  
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "hits": {
     
    
   
    
     
    
    
         
      "hits": [
     
    
   
    
     
    
    
     
            {
     
    
   
    
     
    
    
             
      "_index": 
      "my-index-000001",
     
    
   
    
     
    
    
             
      "_id": 
      "Zn7rTIUBIjh__4nuBm2T",
     
    
   
    
     
    
    
             
      "_score": 
      1,
     
    
   
    
     
    
    
             
      "_source": {
     
    
   
    
     
    
    
               
      "timestamp": 
      "2020-04-30T14:30:17-05:00",
     
    
   
    
     
    
    
               
      "message": 
      """40.135.0.0 - - [30/Apr/2020:14:30:17 -0500] "GET /images/hm_bg.jpg HTTP/1.0" 200 24736"""
     
    
   
    
     
    
    
     
              },
     
    
   
    
     
    
    
             
      "fields": {
     
    
   
    
     
    
    
               
      "http.clientip": [
     
    
   
    
     
    
    
                 
      "40.135.0.0"
     
    
   
    
     
    
    
     
                ]
     
    
   
    
     
    
    
     
              }
     
    
   
    
     
    
    
     
            }
     
    
   
    
     
    
    
     
          ]
     
    
   
    
     
    
    
     
        }
     
    
   
    
     
    
    
     
      }

定义复合运行时字段

你还可以定义复合（composite）运行时字段，以便从单个脚本发出多个字段。你可以定义一组类型化的子字段，并发出一个值的 map。在搜索时，每个子字段从该 map 中检索与其名称关联的值。这意味着你只需指定一次 grok 模式，就可以返回多个值：


  
   
    
     
    
    
     
      PUT my
      -index
      -
      000001
      /_mappings
     
    
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "runtime": {
     
    
   
    
     
    
    
         
      "http": {
     
    
   
    
     
    
    
           
      "type": 
      "composite",
     
    
   
    
     
    
    
           
      "script": 
      "emit(grok(\"%{COMMONAPACHELOG}\").extract(doc[\"message\"].value))",
     
    
   
    
     
    
    
           
      "fields": {
     
    
   
    
     
    
    
             
      "clientip": {
     
    
   
    
     
    
    
               
      "type": 
      "ip"
     
    
   
    
     
    
    
     
              },
     
    
   
    
     
    
    
             
      "verb": {
     
    
   
    
     
    
    
               
      "type": 
      "keyword"
     
    
   
    
     
    
    
     
              },
     
    
   
    
     
    
    
             
      "response": {
     
    
   
    
     
    
    
               
      "type": 
      "long"
     
    
   
    
     
    
    
     
              }
     
    
   
    
     
    
    
     
            }
     
    
   
    
     
    
    
     
          }
     
    
   
    
     
    
    
     
        }
     
    
   
    
     
    
    
     
      }

搜索一个特定的 IP 地址

使用 http.clientip 运行时字段，你可以定义一个简单的查询来运行对特定 IP 地址的搜索并返回所有相关字段。


  
   
    
     
    
    
     
      GET 
      my-
      index-
      000001/_search?filter_path=**.hits
     
    
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "query": {
     
    
   
    
     
    
    
         
      "match": {
     
    
   
    
     
    
    
           
      "http.clientip": 
      "40.135.0.0"
     
    
   
    
     
    
    
     
          }
     
    
   
    
     
    
    
     
        },
     
    
   
    
     
    
    
       
      "fields" : [
      "*"]
     
    
   
    
     
    
    
     
      }

上面的 API 返回以下结果。因为 http 是复合运行时字段，所以响应包括字段下的每个子字段，包括任何与查询匹配的关联值。无需提前构建数据结构，你就可以以有意义的方式搜索和探索数据，以试验并确定要索引的字段。


  
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "hits": {
     
    
   
    
     
    
    
         
      "hits": [
     
    
   
    
     
    
    
     
            {
     
    
   
    
     
    
    
             
      "_index": 
      "my-index-000001",
     
    
   
    
     
    
    
             
      "_id": 
      "Zn7rTIUBIjh__4nuBm2T",
     
    
   
    
     
    
    
             
      "_score": 
      1,
     
    
   
    
     
    
    
             
      "_source": {
     
    
   
    
     
    
    
               
      "timestamp": 
      "2020-04-30T14:30:17-05:00",
     
    
   
    
     
    
    
               
      "message": 
      """40.135.0.0 - - [30/Apr/2020:14:30:17 -0500] "GET /images/hm_bg.jpg HTTP/1.0" 200 24736"""
     
    
   
    
     
    
    
     
              },
     
    
   
    
     
    
    
             
      "fields": {
     
    
   
    
     
    
    
               
      "http.verb": [
     
    
   
    
     
    
    
                 
      "GET"
     
    
   
    
     
    
    
     
                ],
     
    
   
    
     
    
    
               
      "http.clientip": [
     
    
   
    
     
    
    
                 
      "40.135.0.0"
     
    
   
    
     
    
    
     
                ],
     
    
   
    
     
    
    
               
      "http.response": [
     
    
   
    
     
    
    
                 
      200
     
    
   
    
     
    
    
     
                ],
     
    
   
    
     
    
    
               
      "message": [
     
    
   
    
     
    
    
                 
      """40.135.0.0 - - [30/Apr/2020:14:30:17 -0500] "GET /images/hm_bg.jpg HTTP/1.0" 200 24736"""
     
    
   
    
     
    
    
     
                ],
     
    
   
    
     
    
    
               
      "http.client_ip": [
     
    
   
    
     
    
    
                 
      "40.135.0.0"
     
    
   
    
     
    
    
     
                ],
     
    
   
    
     
    
    
               
      "timestamp": [
     
    
   
    
     
    
    
                 
      "2020-04-30T19:30:17.000Z"
     
    
   
    
     
    
    
     
                ]
     
    
   
    
     
    
    
     
              }
     
    
   
    
     
    
    
     
            }
     
    
   
    
     
    
    
     
          ]
     
    
   
    
     
    
    
     
        }
     
    
   
    
     
    
    
     
      }

另外，还记得脚本中的 if 语句吗？

if (clientip != null) emit(clientip);

如果脚本不包含此条件，则查询将在任何与模式不匹配的分片上失败。通过包含此条件，查询会跳过与 grok 模式不匹配的数据。

搜索特定范围内的文档

你还可以运行对时间戳字段进行操作的范围查询。以下查询返回时间戳大于或等于 2020-04-30T14:31:27-05:00 的任何文档：


  
   
    
     
    
    
     
      GET 
      my-
      index-
      000001/_search?filter_path=**.hits
     
    
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "query": {
     
    
   
    
     
    
    
         
      "range": {
     
    
   
    
     
    
    
           
      "timestamp": {
     
    
   
    
     
    
    
             
      "gte": 
      "2020-04-30T14:31:27-05:00"
     
    
   
    
     
    
    
     
            }
     
    
   
    
     
    
    
     
          }
     
    
   
    
     
    
    
     
        }
     
    
   
    
     
    
    
     
      }

响应包括日志格式不匹配但时间戳在定义范围内的文档。

使用 dissect 模式定义运行时字段

如果你不需要正则表达式的强大功能，可以使用 dissect 模式来代替 grok 模式。dissect 模式只匹配固定的分隔符，但通常比 grok 更快。

你可以使用 dissect 来获得与使用 grok 模式解析 Apache 日志相同的结果。与匹配日志模式不同，你需要在模式中写出字符串中要丢弃的部分。特别留意这些要丢弃的部分，有助于构建出能成功解析的 dissect 模式。


  
   
    
     
    
    
     
      PUT 
      my-
      index-
      000001/_mappings
     
    
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "runtime": {
     
    
   
    
     
    
    
         
      "http.client.ip": {
     
    
   
    
     
    
    
           
      "type": 
      "ip",
     
    
   
    
     
    
    
           
      "script": 
      """
     
    
   
    
     
    
    
     
       String clientip=dissect('%{clientip} %{ident} %{auth} [%{@timestamp}] "%{verb} %{request} HTTP/%{httpversion}" %{status} %{size}').extract(doc["message"].value)?.clientip;
     
    
   
    
     
    
    
     
       if (clientip != null) emit(clientip);
     
    
   
    
     
    
    
     
       """
     
    
   
    
     
    
    
     
          }
     
    
   
    
     
    
    
     
        }
     
    
   
    
     
    
    
     
      }

同样，你可以定义一个 dissect 模式来提取 HTTP 响应代码：


  
   
    
     
    
    
     
      PUT 
      my-
      index-
      000001/_mappings
     
    
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "runtime": {
     
    
   
    
     
    
    
         
      "http.responses": {
     
    
   
    
     
    
    
           
      "type": 
      "long",
     
    
   
    
     
    
    
           
      "script": 
      """
     
    
   
    
     
    
    
     
       String response=dissect('%{clientip} %{ident} %{auth} [%{@timestamp}] "%{verb} %{request} HTTP/%{httpversion}" %{response} %{size}').extract(doc["message"].value)?.response;
     
    
   
    
     
    
    
     
       if (response != null) emit(Integer.parseInt(response));
     
    
   
    
     
    
    
     
       """
     
    
   
    
     
    
    
     
          }
     
    
   
    
     
    
    
     
        }
     
    
   
    
     
    
    
     
      }

然后，你可以运行查询以使用 http.responses 运行时字段检索特定的 HTTP 响应。使用 _search 请求的 fields 参数来指示你要检索的字段：


  
   
    
     
    
    
     
      GET 
      my-
      index-
      000001/_search?filter_path=**.hits
     
    
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "query": {
     
    
   
    
     
    
    
         
      "match": {
     
    
   
    
     
    
    
           
      "http.responses": 
      "304"
     
    
   
    
     
    
    
     
          }
     
    
   
    
     
    
    
     
        },
     
    
   
    
     
    
    
       
      "fields" : [
      "http.client_ip",
      "timestamp",
      "http.verb"]
     
    
   
    
     
    
    
     
      }

响应包括单个文档，其中 HTTP 响应为 304：


  
   
    
     
    
    
     
      {
     
    
   
    
     
    
    
       
      "hits": {
     
    
   
    
     
    
    
         
      "hits": [
     
    
   
    
     
    
    
     
            {
     
    
   
    
     
    
    
             
      "_index": 
      "my-index-000001",
     
    
   
    
     
    
    
             
      "_id": 
      "an7rTIUBIjh__4nuBm2T",
     
    
   
    
     
    
    
             
      "_score": 
      1,
     
    
   
    
     
    
    
             
      "_source": {
     
    
   
    
     
    
    
               
      "timestamp": 
      "2020-04-30T14:31:22-05:00",
     
    
   
    
     
    
    
               
      "message": 
      """247.37.0.0 - - [30/Apr/2020:14:31:22 -0500] "GET /images/hm_nbg.jpg HTTP/1.0" 304 0"""
     
    
   
    
     
    
    
     
              },
     
    
   
    
     
    
    
             
      "fields": {
     
    
   
    
     
    
    
               
      "http.verb": [
     
    
   
    
     
    
    
                 
      "GET"
     
    
   
    
     
    
    
     
                ],
     
    
   
    
     
    
    
               
      "http.client_ip": [
     
    
   
    
     
    
    
                 
      "247.37.0.0"
     
    
   
    
     
    
    
     
                ],
     
    
   
    
     
    
    
               
      "timestamp": [
     
    
   
    
     
    
    
                 
      "2020-04-30T19:31:22.000Z"
     
    
   
    
     
    
    
     
                ]
     
    
   
    
     
    
    
     
              }
     
    
   
    
     
    
    
     
            }
     
    
   
    
     
    
    
     
          ]
     
    
   
    
     
    
    
     
        }
     
    
   
    
     
    
    
     
      }

转载：https://blog.csdn.net/UbuntuTouch/article/details/128443094

查看评论

飞道的博客

飞道的博客

个人资料

文章分类

文章存档

阅读排行

评论排行

推荐文章