
A look at image preprocessing for detection and segmentation models in PyTorch

- Main question: which data augmentation operations do detection and segmentation models actually use?

- What input sizes do detection and segmentation models use? Faster R-CNN uses fairly large inputs, around 800 on the short side; SSD comes in 300 and 512 variants; DeepLab-style segmentation models typically use crops such as 321, 513, or 769. How sensitive are the results to input size?

- Detection and segmentation models train with fairly small batch sizes because GPU memory consumption is high. How does this relate to input size? The DeepLab family additionally uses dilated convolutions, which keep feature maps large and push memory up further; a rough estimate follows below.
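Activation memory grows linearly with H x W, so a 769 crop already costs about 2.25x a 513 crop per feature map. A back-of-the-envelope sketch (the batch size, channel count, and output stride are made-up illustrative numbers, not taken from any particular model):

# rough float32 activation-memory estimate for a single feature map
def feat_mem_mb(batch, channels, h, w, bytes_per=4):
    return batch * channels * h * w * bytes_per / 1024 ** 2

# e.g. a stride-8 backbone output with 2048 channels, DeepLab-style
for crop in (321, 513, 769):
    side = crop // 8 + 1  # spatial size at output stride 8
    print(crop, round(feat_mem_mb(batch=8, channels=2048, h=side, w=side)), "MB")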

 

Detection models

- Compute the per-channel RGB mean over the dataset and subtract it;

- Resize with the aspect ratio preserved, so the image is not distorted;

- Pad the height and width to multiples of 32;

- Also, which augmentation methods do detection models use? Whatever is chosen must be applied to the annotations as well, and bounding boxes are awkward to transform (a flip sketch follows the Resizer code below).

A third step can also be taken: padding the width and height to a multiple of 32, as RetinaNet does. Below is a simple PyTorch data-preprocessing module:

import numpy as np
import skimage.transform
import torch

class Resizer():
    """Resize the short side to targetSize (capped so the long side stays
    within maxSize), then pad H and W up to the next multiple of pad_N."""
    def __call__(self, sample, targetSize=608, maxSize=1024, pad_N=32):
        image, anns = sample['img'], sample['ann']
        rows, cols = image.shape[:2]

        # aspect-preserving scale: short side -> targetSize, long side <= maxSize
        smaller_size, larger_size = min(rows, cols), max(rows, cols)
        scale = targetSize / smaller_size
        if larger_size * scale > maxSize:
            scale = maxSize / larger_size
        image = skimage.transform.resize(image, (int(round(rows * scale)),
                                                 int(round(cols * scale))),
                                         mode='constant')
        rows, cols, cns = image.shape[:3]

        # pad up to a multiple of pad_N; the trailing % pad_N keeps already
        # aligned sizes unchanged instead of adding a full extra pad_N of zeros
        pad_w = (pad_N - cols % pad_N) % pad_N
        pad_h = (pad_N - rows % pad_N) % pad_N
        new_image = np.zeros((rows + pad_h, cols + pad_w, cns), dtype=np.float32)
        new_image[:rows, :cols, :] = image.astype(np.float32)

        # boxes are (x1, y1, x2, y2, ...): scale the coordinates by the same factor
        anns[:, :4] *= scale
        return {'img': torch.from_numpy(new_image),
                'ann': torch.from_numpy(anns),
                'scale': scale}
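On the augmentation question above: every geometric transform must remap the boxes. A minimal sketch of a horizontal flip that updates (x1, y1, x2, y2) coordinates, meant to run before the Resizer while 'img' is still an HWC NumPy array; the 0.5 probability and the sample-dict layout are assumptions carried over from the code above, not any repo's exact augmenter:

import random

class RandomHorizontalFlip():
    """Flip image and boxes left-right with probability p (illustrative sketch)."""
    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, sample):
        if random.random() < self.p:
            image, anns = sample['img'], sample['ann']
            cols = image.shape[1]
            image = image[:, ::-1, :].copy()   # HWC left-right flip
            x1, x2 = anns[:, 0].copy(), anns[:, 2].copy()
            anns[:, 0] = cols - x2             # new x1 comes from the old x2
            anns[:, 2] = cols - x1             # new x2 comes from the old x1
            sample = {'img': image, 'ann': anns}
        return sample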

Segmentation models

- How segmentation models handle data augmentation, illustrated by two reference implementations:

- https://github.com/hualin95/Deeplab-v3plus/blob/master/datasets/cityscapes_Dataset.py

    def __getitem__(self, item):
        id = self.items[item]
        filename = id.split("train_")[-1].split("val_")[-1]
        image_filepath = os.path.join(self.image_filepath, id.split("_")[0], id.split("_")[1])
        image_filename = filename + "_leftImg8bit.png"
        image_path = os.path.join(image_filepath, image_filename)
        image = Image.open(image_path).convert("RGB")

        if self.split == "test":
            return self._test_transform(image), filename

        gt_filepath = os.path.join(self.gt_filepath, id.split("_")[0], id.split("_")[1])
        gt_filename = filename + "_gtFine_labelIds.png"
        gt_image_path = os.path.join(gt_filepath, gt_filename)
        gt_image = Image.open(gt_image_path)

        if self.split == "train" or self.split == "trainval":
            image, gt_image = self._train_sync_transform(image, gt_image)
        else:
            image, gt_image = self._val_sync_transform(image, gt_image, filename)
        return image, gt_image, filename

    def _train_sync_transform(self, img, mask):
        '''
        :param img: PIL input image
        :param mask: PIL input ground-truth mask
        :return: transformed (img, mask)
        '''
        # random mirror
        if random.random() < 0.5:
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
            mask = mask.transpose(Image.FLIP_LEFT_RIGHT)
        crop_size = self.crop_size
        # random scale (short edge)
        short_size = random.randint(int(self.base_size * 0.5), int(self.base_size * 2.0))
        w, h = img.size
        if h > w:
            ow = short_size
            oh = int(1.0 * h * ow / w)
        else:
            oh = short_size
            ow = int(1.0 * w * oh / h)
        img = img.resize((ow, oh), Image.BILINEAR)
        mask = mask.resize((ow, oh), Image.NEAREST)
        # pad crop
        if short_size < crop_size:
            padh = crop_size - oh if oh < crop_size else 0
            padw = crop_size - ow if ow < crop_size else 0
            img = ImageOps.expand(img, border=(0, 0, padw, padh), fill=0)
            mask = ImageOps.expand(mask, border=(0, 0, padw, padh), fill=0)
        # random crop crop_size
        w, h = img.size
        x1 = random.randint(0, w - crop_size)
        y1 = random.randint(0, h - crop_size)
        img = img.crop((x1, y1, x1 + crop_size, y1 + crop_size))
        mask = mask.crop((x1, y1, x1 + crop_size, y1 + crop_size))
        # gaussian blur as in PSP
        if random.random() < 0.5:
            img = img.filter(ImageFilter.GaussianBlur(
                radius=random.random()))
        # final transform
        img, mask = self._img_transform(img), self._mask_transform(mask)
        return img, mask

    def _val_sync_transform(self, img, mask, filename=None):
        outsize = self.crop_size
        short_size = outsize
        w, h = img.size
        if w > h:
            oh = short_size
            ow = int(1.0 * w * oh / h)
        else:
            ow = short_size
            oh = int(1.0 * h * ow / w)
        img = img.resize((ow, oh), Image.BILINEAR)
        mask = mask.resize((ow, oh), Image.NEAREST)
        # center crop
        w, h = img.size
        x1 = int(round((w - outsize) / 2.))
        y1 = int(round((h - outsize) / 2.))
        img = img.crop((x1, y1, x1 + outsize, y1 + outsize))
        mask = mask.crop((x1, y1, x1 + outsize, y1 + outsize))
        # final transform
        img, mask = self._img_transform(img), self._mask_transform(mask, filename)
        return img, mask

    def _test_transform(self, img):
        outsize = self.crop_size
        short_size = outsize
        w, h = img.size
        if w > h:
            oh = short_size
            ow = int(1.0 * w * oh / h)
        else:
            ow = short_size
            oh = int(1.0 * h * ow / w)
        img = img.resize((ow, oh), Image.BILINEAR)
        # center crop
        w, h = img.size
        x1 = int(round((w - outsize) / 2.))
        y1 = int(round((h - outsize) / 2.))
        img = img.crop((x1, y1, x1 + outsize, y1 + outsize))
        # final transform
        img = self._img_transform(img)
        return img

    def _img_transform(self, image):
        image_transforms = ttransforms.Compose([
            ttransforms.ToTensor(),
            ttransforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])
        image = image_transforms(image)
        return image

    def _mask_transform(self, gt_image, filename=None):
        target = self._class_to_index(np.array(gt_image).astype('int32'), filename)
        target = torch.from_numpy(target)
        return target

    def __len__(self):
        return len(self.items)
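The pattern worth extracting from this class: every random decision is sampled once and then applied to both img and mask. A condensed sketch of the same idea using torchvision's functional API (crop_size=512 is an arbitrary choice, and the image is assumed to be at least crop_size on each side):

import random
import torchvision.transforms as T
import torchvision.transforms.functional as TF

def joint_transform(img, mask, crop_size=512):
    # sample each random decision once, then apply it to both inputs
    if random.random() < 0.5:
        img, mask = TF.hflip(img), TF.hflip(mask)
    i, j, h, w = T.RandomCrop.get_params(img, output_size=(crop_size, crop_size))
    return TF.crop(img, i, j, h, w), TF.crop(mask, i, j, h, w)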

- Images are read with PIL, so convert("RGB") is used to guarantee RGB channel order;

- Both img and mask must go through the augmentation, with identical random parameters;

- https://github.com/kazuto1011/deeplab-pytorch/blob/master/libs/datasets/cocostuff.py

    def __getitem__(self, index):
        if self.preload:
            image, label = self.images[index], self.labels[index]
        else:
            image_id = self.files[index]
            image, label = self._load_data(image_id)
        image, label = self._transform(image, label)
        return image.astype(np.float32), label.astype(np.int64)

    def _load_data(self, image_id):
        # Set paths
        image_path = osp.join(self.root, "images", image_id + ".jpg")
        label_path = osp.join(self.root, "annotations", image_id + ".mat")
        # Load an image
        image = cv2.imread(image_path, cv2.IMREAD_COLOR).astype(np.float32)
        # Load a label map
        if self.version == "1.1":
            label = sio.loadmat(label_path)["S"].astype(np.int64)
            label -= 1  # unlabeled (0 -> -1)
        elif self.version == "1.0":
            label = np.array(h5py.File(label_path, "r")["S"], dtype=np.int64)
            label = label.transpose(1, 0)
            label -= 2  # unlabeled (1 -> -1)
        else:
            raise NotImplementedError(
                "1.0 or 1.1 expected, but got: {}".format(self.version)
            )
        return image, label

    def _transform(self, image, label):
        # Mean subtraction
        image -= self.mean
        # Pre-scaling
        if self.warp:
            base_size = (self.base_size,) * 2
        else:
            raw_h, raw_w = label.shape
            if raw_h > raw_w:
                base_size = (int(self.base_size * raw_w / raw_h), self.base_size)
            else:
                base_size = (self.base_size, int(self.base_size * raw_h / raw_w))
        image = cv2.resize(image, base_size, interpolation=cv2.INTER_LINEAR)
        label = cv2.resize(label, base_size, interpolation=cv2.INTER_NEAREST)
        if self.scale is not None:
            # Scaling
            scale_factor = random.choice(self.scale)
            scale_kwargs = {"dsize": None, "fx": scale_factor, "fy": scale_factor}
            image = cv2.resize(image, interpolation=cv2.INTER_LINEAR, **scale_kwargs)
            label = cv2.resize(label, interpolation=cv2.INTER_NEAREST, **scale_kwargs)
            scale_h, scale_w = label.shape
            # Padding
            pad_h = max(max(base_size[1], self.crop_size) - scale_h, 0)
            pad_w = max(max(base_size[0], self.crop_size) - scale_w, 0)
            pad_kwargs = {
                "top": 0,
                "bottom": pad_h,
                "left": 0,
                "right": pad_w,
                "borderType": cv2.BORDER_CONSTANT,
            }
            if pad_h > 0 or pad_w > 0:
                image = cv2.copyMakeBorder(image, value=(0.0, 0.0, 0.0), **pad_kwargs)
                label = cv2.copyMakeBorder(label, value=self.ignore_label, **pad_kwargs)
            # Random cropping
            base_h, base_w = label.shape
            start_h = random.randint(0, base_h - self.crop_size)
            start_w = random.randint(0, base_w - self.crop_size)
            end_h = start_h + self.crop_size
            end_w = start_w + self.crop_size
            image = image[start_h:end_h, start_w:end_w]
            label = label[start_h:end_h, start_w:end_w]
        if self.flip:
            # Random flipping
            if random.random() < 0.5:
                image = np.fliplr(image).copy()  # HWC
                label = np.fliplr(label).copy()  # HW
        # HWC -> CHW
        image = image.transpose(2, 0, 1)
        return image, label

- Images are read with OpenCV here, so the channels arrive in BGR order, and the subtracted mean must follow the same order;

- Neither pipeline applies random rotation, presumably because rotation leaves empty black regions at the corners (a sketch of a safer rotation follows this list);

- Takeaway: in future projects, mind how these operations are applied, train the networks, and measure how much each augmentation actually improves the task!
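On the rotation point: if random rotation is added anyway, the exposed corners need a fill value, and the mask should be filled with the ignore label so the loss skips those pixels. A PIL-based sketch; ignore_label=255 and the +/-10 degree range are assumptions, not values from either repo:

import random
from PIL import Image

def random_rotate(img, mask, max_deg=10, ignore_label=255):
    deg = random.uniform(-max_deg, max_deg)
    # bilinear for the image, nearest for the mask; fill the exposed corners
    img = img.rotate(deg, resample=Image.BILINEAR, fillcolor=0)
    mask = mask.rotate(deg, resample=Image.NEAREST, fillcolor=ignore_label)
    return img, mask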
