
MXNet的Faster R-CNN(基于区域提议网络的实时目标检测)《4》

        这篇主要了解语义分割(semantic segmentation)。语义分割是分类中的一个核心知识点,它关注如何将图像分割成属于不同语义类别的区域,而且这些语义区域的标注和预测都是像素级的。另外还有两个与语义分割很相似的重要问题,需要注意区分:
图像分割(image segmentation):将图像分割成若干组成区域。这类问题的方法通常利用图像中像素之间的相关性(可以去了解下泛洪填充),在训练的时候不需要有关图像像素的标签信息,当然在预测的时候也就没法保证分割出来的区域具有我们想要的语义。比如可能将一只狗分割成两部分:毛色一样的身体是一部分,黑色的脑袋是另一部分。

实例分割(instance segmentation)又叫同时检测并分割(simultaneous detection and segmentation),它研究如何识别图像中各个目标实例的像素级区域。跟语义分割不同的是,实例分割不仅需要区分语义,还要区分不同的目标实例。比如图像中有两只狗,实例分割需要区分像素属于这两只狗中的哪一只。

Pascal VOC2012语义分割数据集

对于这个数据集,可以参见前文《MXNet的Faster R-CNN(基于区域提议网络的实时目标检测)《1》》中的介绍。


  import d2lzh as d2l

  # Number of (image, label) pairs to preview.
  n = 5
  train_features, train_labels = d2l.read_voc_images()
  # First n feature images followed by their n label images, so that a
  # 2 x n grid shows the features on the top row and the labels below.
  preview = train_features[:n] + train_labels[:n]
  d2l.show_images(preview, 2, n)
  d2l.plt.show()


  def show_images(imgs, num_rows, num_cols, scale=2):
      """Plot a list of images on a ``num_rows`` x ``num_cols`` grid.

      Args:
          imgs: Sequence of images in row-major order. Each element must
              provide ``asnumpy()``; at least ``num_rows * num_cols``
              elements are read.
          num_rows: Number of grid rows.
          num_cols: Number of grid columns.
          scale: Size factor applied per cell to compute the figure size.

      Returns:
          The two-dimensional array of axes, indexed as ``axes[row][col]``.
      """
      figsize = (num_cols * scale, num_rows * scale)
      # squeeze=False keeps `axes` two-dimensional even when the grid has a
      # single row or column; without it matplotlib returns a 1-D array (or
      # a bare Axes) and the axes[i][j] indexing below fails.
      _, axes = plt.subplots(num_rows, num_cols, figsize=figsize,
                             squeeze=False)
      for i in range(num_rows):
          for j in range(num_cols):
              ax = axes[i][j]
              # Cell (i, j) shows the image at flat index i * num_cols + j.
              ax.imshow(imgs[i * num_cols + j].asnumpy())
              ax.axes.get_xaxis().set_visible(False)
              ax.axes.get_yaxis().set_visible(False)
      return axes

axes[i][j].imshow(imgs[i * num_cols + j].asnumpy()) 这一行把第i行第j列(i、j都从0开始)的子图对应到imgs中的索引 i * num_cols + j。比如这里的n=5,也就是2行5列,我们拿第2行第3列举例(i=1,j=2),显示的就是imgs[1 * 5 + 2]即imgs[7],也就是说第2行的第3列就是图片索引为7的数据。





  # Class names; the position in this list is the class index.
  VOC_CLASSES = [
      'background', 'aeroplane', 'bicycle', 'bird',
      'boat', 'bottle', 'bus', 'car',
      'cat', 'chair', 'cow', 'diningtable',
      'dog', 'horse', 'motorbike', 'person',
      'potted plant', 'sheep', 'sofa', 'train',
      'tv/monitor',
  ]

  # RGB color used in the label images for the class at the same position
  # in VOC_CLASSES.
  VOC_COLORMAP = [
      [0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0],
      [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128],
      [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0],
      [64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128],
      [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0],
      [0, 64, 128],
  ]


  train_features, train_labels = d2l.read_voc_images()
  # Lookup table with one slot per possible RGB value: each channel takes
  # 256 values (0..255), giving 256 ** 3 combinations.
  colormap2label = nd.zeros(256 ** 3)
  # Map every class color to its class index. The three channels are packed
  # into a single integer key: (R * 256 + G) * 256 + B.
  for i, colormap in enumerate(VOC_COLORMAP):
      colormap2label[(colormap[0] * 256 + colormap[1]) * 256 + colormap[2]] = i



  # The label color values already ship with the d2lzh package.
  def voc_label_indices(colormap, colormap2label):
      """Convert an RGB label image into a map of class indices.

      Args:
          colormap: Label image indexed as ``[row, col, channel]`` with the
              R, G and B values in channels 0, 1 and 2.
          colormap2label: 1-D lookup table indexed by the packed color
              ``(R * 256 + G) * 256 + B``.

      Returns:
          The entries of ``colormap2label`` selected by each pixel's packed
          color, one per ``[row, col]`` position.
      """
      # Widen first so the packed value (up to 256 ** 3 - 1) cannot overflow
      # a narrow pixel dtype.
      colormap = colormap.astype('int32')
      idx = ((colormap[:, :, 0] * 256 + colormap[:, :, 1]) * 256
             + colormap[:, :, 2])
      return colormap2label[idx]
  # Sample 2007_000241.
  label_map = voc_label_indices(train_labels[6], colormap2label)
  # Print a 20 x 20 window of class indices, then the name of class 4.
  print(label_map[160:180, 280:300])
  print(d2l.VOC_CLASSES[4])
  '''
  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 4.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 4. 4. 4. 4.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 4. 4. 4. 4. 4.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 4. 4. 4. 4. 4.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 4. 4. 4. 4. 4.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
  [0. 0. 0. 0. 0. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
  [0. 0. 0. 0. 0. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
  [0. 0. 0. 0. 0. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
  [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
  [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
  [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]
  [4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4. 4.]]
  <NDArray 20x20 @cpu(0)>
  boat
  '''




  imgs = []
  n = 5
  # Crop the same (image, label) pair n times with crop arguments 200 and
  # 300. Each call returns the cropped image and its cropped label, so the
  # flat list alternates image, label, image, label, ...
  for _ in range(n):
      imgs += d2l.voc_rand_crop(train_features[0], train_labels[0], 200, 300)
  # imgs[::2] steps by 2 from the first element (the image crops);
  # imgs[1::2] steps by 2 from the second element (the label crops).
  # Concatenating them puts the images on the top row and the labels on the
  # bottom row. For comparison, d2l.show_images(imgs, 2, n) would keep the
  # interleaved order instead.
  d2l.show_images(imgs[::2] + imgs[1::2], 2, n)
  d2l.plt.show()



  # Root directory of the extracted VOC2012 data.
  voc_dir = "../data/VOCdevkit/VOC2012"
  # Crop size for every sample; images smaller than this in either of the
  # first two dimensions are filtered out by the dataset.
  crop_size = (500, 480)
  # First positional argument: True builds the training split, False the
  # test split.
  voc_train = d2l.VOCSegDataset(True, crop_size, voc_dir, colormap2label)
  voc_test = d2l.VOCSegDataset(False, crop_size, voc_dir, colormap2label)
  '''
  read 14 examples
  read 16 examples
  '''



  import matplotlib.pyplot as plt

  # Element 0 of a sample is the image; the dataset returns it transposed
  # with axes (2, 0, 1), so (1, 2, 0) moves the channel axis back to the end
  # before handing the array to imshow.
  first_image = voc_train[0][0]
  imgarr = first_image.transpose((1, 2, 0)).asnumpy()
  print(imgarr)
  # The dataset normalizes its images, so values may fall outside the range
  # imshow accepts for float RGB data.
  plt.imshow(imgarr)
  plt.colorbar()
  plt.show()


  import matplotlib.image as image

  # Load an ordinary image file to inspect the value range of its pixels.
  pixels = image.imread('hi.jpg')
  # Bare expressions: the values are only echoed in an interactive session.
  pixels.min()
  pixels.max()


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).


  batch_size = 64
  # last_batch='discard' drops a trailing batch that has fewer than
  # batch_size samples.
  train_iter = gdata.DataLoader(voc_train, batch_size, shuffle=True,
                                last_batch='discard')
  test_iter = gdata.DataLoader(voc_test, batch_size, shuffle=True,
                               last_batch='discard')
  # Look at the shapes of the first mini-batch only.
  for X, y in train_iter:
      print(X.shape)
      print(y.shape)
      break
  '''
  (64, 3, 300, 480)
  (64, 300, 480)
  '''


  # Fill the lookup table: packed RGB value -> class index.
  for i, colormap in enumerate(VOC_COLORMAP):
      colormap2label[(colormap[0] * 256 + colormap[1]) * 256 + colormap[2]] = i
  crop_size = (300, 480)
  voc_dir = "../data/VOCdevkit/VOC2012"
  voc_train = d2l.VOCSegDataset(True, crop_size, voc_dir, colormap2label)
  voc_test = d2l.VOCSegDataset(False, crop_size, voc_dir, colormap2label)
  # NOTE: every __getitem__ call performs a new random crop, so the sample
  # printed here and the label fetched on the next line may come from
  # different crops of the same image.
  print(voc_train[0])
  # Element 1 of a sample is the map of class indices.
  imgarr = voc_train[0][1]
  print(imgarr.max())  # 20
  print(imgarr[100:105, 150:160])
  '''
  [[ 0. 0. 20. 20. 20. 20. 20. 20. 20. 20.]
  [ 0. 0. 20. 20. 20. 20. 20. 20. 20. 20.]
  [ 0. 0. 20. 20. 20. 20. 20. 20. 20. 20.]
  [ 0. 0. 20. 20. 20. 20. 20. 20. 20. 20.]
  [ 0. 0. 20. 20. 20. 20. 20. 20. 20. 20.]]
  <NDArray 5x10 @cpu(0)>
  '''


  class VOCSegDataset(gdata.Dataset):
      """The Pascal VOC2012 Dataset.

      Loads the images and label images of one split, drops every image that
      is smaller than the crop size, normalizes the remaining images, and
      serves randomly cropped (image, class-index map) pairs.

      Args:
          is_train: Passed to ``read_voc_images`` to select the split.
          crop_size: Pair of crop dimensions, compared against
              ``shape[0]`` and ``shape[1]`` of each image (presumably
              height and width -- confirm against ``voc_rand_crop``).
          voc_dir: Dataset root directory passed to ``read_voc_images``.
          colormap2label: Lookup table from packed RGB value to class index.
      """

      def __init__(self, is_train, crop_size, voc_dir, colormap2label):
          # Per-channel mean and standard deviation used for normalization.
          self.rgb_mean = nd.array([0.485, 0.456, 0.406])
          self.rgb_std = nd.array([0.229, 0.224, 0.225])
          self.crop_size = crop_size
          data, labels = read_voc_images(root=voc_dir, is_train=is_train)
          # Images and labels pass through the same size filter, which keeps
          # the two lists aligned as long as each label has the same shape
          # as its image.
          self.data = [self.normalize_image(im) for im in self.filter(data)]
          self.labels = self.filter(labels)
          self.colormap2label = colormap2label
          print('read ' + str(len(self.data)) + ' examples')

      def normalize_image(self, data):
          """Scale pixel values by 1/255, then standardize each channel."""
          return (data.astype('float32') / 255 - self.rgb_mean) / self.rgb_std

      def filter(self, images):
          """Keep only the images at least as large as the crop size."""
          return [im for im in images if (
              im.shape[0] >= self.crop_size[0] and
              im.shape[1] >= self.crop_size[1])]

      def __getitem__(self, idx):
          """Return a random crop of sample ``idx``.

          The image is transposed with axes (2, 0, 1) and the label image is
          converted to class indices. Each call crops anew, so repeated
          access to the same index may return different crops.
          """
          data, labels = voc_rand_crop(self.data[idx], self.labels[idx],
                                       *self.crop_size)
          return (data.transpose((2, 0, 1)),
                  voc_label_indices(labels, self.colormap2label))

      def __len__(self):
          """Return the number of samples that survived the size filter."""
          return len(self.data)

