Elasticsearch：如何调试集群状态 - 定位错误信息_飞道的博客

Elasticsearch：如何调试集群状态 - 定位错误信息

2020-10-13 10:43 589人阅读评论(0)

对于 Elasticsearch 集群，我们可以通过如下的 _cluster/health 命令来查询集群的状态：

GET _cluster/health

在正常的情况下，它会显示健康的状态，也就是绿色。关于健康状态颜色的描述，我们可以参考我之前的文章 “Elasticsearch中的一些重要概念:cluster, node, index, document, shards及replica”。但是当我们的集群中有未被分配的 shard，或者数据有缺失时，它的状态就会显示为黄色或者红色。

红色：集群中至少有一个主分片未被分配
黄色：所有主分片均已分配，但至少有一个副本分片未被分配
绿色：所有分片均已分配

上面的命令返回的结果如下：


  
   
    
     
    
    
     
{
  "cluster_name" : "my_cluster",
  "status" : "red",
  "timed_out" : false,
  "number_of_nodes" : 1,
  "number_of_data_nodes" : 1,
  "active_primary_shards" : 104,
  "active_shards" : 104,
  "relocating_shards" : 0,
  "initializing_shards" : 0,
  "unassigned_shards" : 60,
  "delayed_unassigned_shards" : 0,
  "number_of_pending_tasks" : 0,
  "number_of_in_flight_fetch" : 0,
  "task_max_waiting_in_queue_millis" : 0,
  "active_shards_percent_as_number" : 63.41463414634146
}

上面显示，我们当前的集群状态为红色，表示数据有丢失的情况。那么我们怎么查出来是哪个 shard 和哪个索引出问题了呢？

我们可以使用如下的命令来对集群进行查看：

GET _cluster/health?level=indices

上面的命令可以让我们定位到到底是哪一个或者哪一些索引有问题。上面的命令显示的结果为：

从上面我们可以看出来 restored_logs_4 这个索引是有问题的。它显示的状态为 red，也即是红色。

我们也可以对 shard 进行查询：

GET _cluster/health?level=shards

上面的命令显示的结果为：

上面的命令显示 restored_logs_4 这个索引的 shard 0 的状态是 red。这种情况发生在这个 shard 从未被分配过，或者曾经被分配过，但是整个 node 可能由于某种原因而造成这个 shard 的丢失。

我们甚至可以直接使用如下的方法来得到这个索引的所有情况：

GET _cluster/health/restored_logs_4?level=shards

上面显示的结果为：


  
   
    
     
    
    
     
{
  "cluster_name" : "my_cluster",
  "status" : "red",
  "timed_out" : false,
  "number_of_nodes" : 1,
  "number_of_data_nodes" : 1,
  "active_primary_shards" : 0,
  "active_shards" : 0,
  "relocating_shards" : 0,
  "initializing_shards" : 0,
  "unassigned_shards" : 2,
  "delayed_unassigned_shards" : 0,
  "number_of_pending_tasks" : 0,
  "number_of_in_flight_fetch" : 0,
  "task_max_waiting_in_queue_millis" : 0,
  "active_shards_percent_as_number" : 63.41463414634146,
  "indices" : {
    "restored_logs_4" : {
      "status" : "red",
      "number_of_shards" : 1,
      "number_of_replicas" : 1,
      "active_primary_shards" : 0,
      "active_shards" : 0,
      "relocating_shards" : 0,
      "initializing_shards" : 0,
      "unassigned_shards" : 2,
      "shards" : {
        "0" : {
          "status" : "red",
          "primary_active" : false,
          "active_shards" : 0,
          "relocating_shards" : 0,
          "initializing_shards" : 0,
          "unassigned_shards" : 2
        }
      }
    }
  }
}

为了能够更进一步查出来到底是什么原因造成的，我们可以使用如下的命令来进行查询：

GET _cluster/allocation/explain

在实际的使用中，我们需要配置一些参数来得到某个具体索引的分配情况，比如：


  
   
    
     
    
    
     
GET _cluster/allocation/explain
{
  "index": "restored_logs_4",
  "shard": 0,
  "primary": true
}

上面的命令显示的结果为：


  
   
    
     
    
    
     
{
  "index" : "restored_logs_4",
  "shard" : 0,
  "primary" : true,
  "current_state" : "unassigned",
  "unassigned_info" : {
    "reason" : "CLUSTER_RECOVERED",
    "at" : "2020-10-05T08:08:54.241Z",
    "last_allocation_status" : "no_valid_shard_copy"
  },
  "can_allocate" : "no_valid_shard_copy",
  "allocate_explanation" : "cannot allocate because a previous copy of the primary shard existed but can no longer be found on the nodes in the cluster",
  "node_allocation_decisions" : [
    {
      "node_id" : "Ohi9yhffThGZ5X8gq4AXLw",
      "node_name" : "node1",
      "transport_address" : "127.0.0.1:9300",
      "node_attributes" : {
        "ml.machine_memory" : "34359738368",
        "xpack.installed" : "true",
        "transform.node" : "true",
        "ml.max_open_jobs" : "20",
        "my_rack" : "rack1"
      },
      "node_decision" : "no",
      "store" : {
        "found" : false
      }
    }
  ]
}

从上面的描述中，我们可以看到为啥我们的 shard 是分配不成功的。

转载：https://blog.csdn.net/UbuntuTouch/article/details/108973356

查看评论

飞道的博客

飞道的博客

个人资料

文章分类

文章存档

阅读排行

评论排行

推荐文章

Elasticsearch：如何调试集群状态 - 定位错误信息

* 以上用户言论只代表其个人观点，不代表本网站的观点或立场