• Dataset Introduction

The Pascal VOC dataset is one of the most widely used datasets for object detection and segmentation; it covers 20 object classes. The directory structure of a VOC dataset is as follows.

Annotations: stores the annotation information for each image, saved as XML files.

ImageSets:
    a. The Action subfolder holds the image lists for the action classification task.
    b. The Layout subfolder holds the image lists for the person layout (body part) task.
    c. The Main subfolder holds the image lists for the classification and detection tasks.
    d. The Segmentation subfolder holds the image lists for the segmentation task.
    
JPEGImages: stores the image files in JPEG format.

SegmentationClass: stores the segmentation masks labeled by class.

SegmentationObject: stores the segmentation masks labeled by object instance.

For the object detection task, only ImageSets/Main, Annotations, and JPEGImages are needed.
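
For reference, each line in a split file such as ImageSets/Main/trainval.txt is a bare image ID (abbreviated here; the ellipsis stands for the remaining IDs):

```
2007_000063
...
```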

The annotation file format is as follows:

```xml
<annotation>
    <folder>VOC2012</folder>
    <!-- corresponding image file name -->
    <filename>2007_000063.jpg</filename>
    <source>
        <database>The VOC2007 Database</database>
        <annotation>PASCAL VOC2007</annotation>
        <image>flickr</image>
    </source>
    <size>
        <!-- image width -->
        <width>500</width>
        <!-- image height -->
        <height>375</height>
        <!-- image depth (number of channels) -->
        <depth>3</depth>
    </size>
    <segmented>1</segmented>
    <object>
        <!-- object class -->
        <name>dog</name>
        <!-- shooting angle -->
        <pose>Unspecified</pose>
        <!-- whether the object is truncated; 0 means not truncated -->
        <truncated>0</truncated>
        <!-- detection difficulty; 0 means easy -->
        <difficult>0</difficult>
        <!-- bounding box coordinates (x_min, y_min, x_max, y_max) -->
        <bndbox>
            <xmin>123</xmin>
            <ymin>115</ymin>
            <xmax>379</xmax>
            <ymax>275</ymax>
        </bndbox>
    </object>
    <object>
        <name>chair</name>
        <pose>Frontal</pose>
        <truncated>1</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>75</xmin>
            <ymin>1</ymin>
            <xmax>428</xmax>
            <ymax>375</ymax>
        </bndbox>
    </object>
</annotation>
```
• Dataset Construction

The code to build the dataset is as follows:

```python
import os

import numpy as np
import torch as t
from PIL import Image
from torch.utils.data import Dataset
from xml.etree import ElementTree as et

from data.data_aug import *

# the 20 Pascal VOC object classes, in alphabetical order
VOC_BBOX_LABEL_NAMES = (
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor')


class VocDataset(Dataset):
    """
    Pascal VOC dataset.

    Each item is an (image, bboxes, labels, scale) tuple after the
    transforms defined in data_aug have been applied.
    """
    def __init__(self, data_dir, split='trainval'):
        super(VocDataset, self).__init__()
        # path of the image-id list for this split
        list_path = data_dir + 'ImageSets/Main/%s.txt' % split

        # read the image ids, keeping only those whose image and
        # annotation files both exist so the two lists stay aligned
        with open(list_path) as f:
            ids = [line.strip() for line in f]
        ids = [i for i in ids
               if os.path.exists(data_dir + 'JPEGImages/%s.jpg' % i)
               and os.path.exists(data_dir + 'Annotations/%s.xml' % i)]

        # image file paths
        self.imgs = [data_dir + 'JPEGImages/%s.jpg' % i for i in ids]

        # annotation file paths
        self.label = [data_dir + 'Annotations/%s.xml' % i for i in ids]

        assert len(self.imgs) == len(self.label), \
            'image count must equal label count'

    def __len__(self):
        return len(self.imgs)

    def parse_xml(self, xml_file):
        """Parse one annotation file into bounding boxes and class indices."""
        label_tree = et.parse(xml_file).getroot()
        bboxes = []
        label = []
        for obj in label_tree.findall('object'):
            bbox = obj.find('bndbox')
            label.append(VOC_BBOX_LABEL_NAMES.index(obj.find('name').text))
            # [xmin, ymin, xmax, ymax]
            box = [int(bbox.find(tag).text)
                   for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
            bboxes.append(box)

        return bboxes, label

    def __getitem__(self, idx):
        bboxes, labels = self.parse_xml(self.label[idx])
        bboxes = np.array(bboxes)
        labels = np.array(labels).astype(np.int32)

        img = Image.open(self.imgs[idx]).convert('RGB')

        trans = Transform()
        img, bboxes, scale = trans(img, bboxes)

        return img, bboxes, t.from_numpy(labels), scale
```
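
A minimal usage sketch (the data path and split are assumptions; the batch size stays at 1 because images end up with different shapes after resizing):

```python
from torch.utils.data import DataLoader

# hypothetical location of the extracted VOC data
dataset = VocDataset('./VOCdevkit/VOC2007/', split='trainval')
loader = DataLoader(dataset, batch_size=1, shuffle=True)

img, bboxes, labels, scale = next(iter(loader))
print(img.shape, bboxes.shape, labels.shape, scale)
```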

The data augmentation code is as follows:

```python
import numpy as np
np.random.seed(0)
from skimage import transform
import torch as t


def reverse_norm(img, caffe_pretrain):
    """
    reverse a normalized image back to the 0-255 range
    """
    if caffe_pretrain:
        # add the per-channel mean back and restore RGB channel order
        img = img + np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)
        return img[::-1, :, :]

    # approximate un-normalization for visualization
    return (img * 0.225 + 0.45).clip(min=0, max=1) * 255


def caffe_norm(img):
    # caffe channel order is BGR, not RGB
    img = img[:, :, [2, 1, 0]]

    # scale the 0-1 image back to the 0-255 range
    img = img * 255.

    # subtract the per-channel (BGR) mean
    img = img - np.array([122.7717, 115.9465, 102.9801]).reshape((1, 1, 3))

    return img.astype(np.float32)


def standard_norm(img, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """
    Note: the default mean and std are the ImageNet training-set statistics;
    normalizing with them usually speeds up convergence.
    """
    mean, std = np.array(mean).reshape((1, 1, 3)), np.array(std).reshape((1, 1, 3))

    img = (img - mean) / std

    return img.astype(np.float32)

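# Note (assumed convention, not stated in the original): caffe_norm matches
# caffe-pretrained backbones (BGR order, 0-255 range, mean subtraction only),
# while standard_norm matches torchvision's ImageNet-pretrained models
# (RGB order, 0-1 range, mean/std normalization).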

def resize(img, bbox, img_size, size=(1000, 600)):
    """
    Description:

    resize img so that its longer side is at most 1000 pixels and its
    shorter side is at most 600 pixels.

    There is no need to resize all images to the same shape here,
    because the batch size is limited to 1.

    Params:
        img(numpy.array): image before resizing

        bbox(numpy.array): bounding boxes

        img_size(tuple): (image width, image height); only min/max are used

        size(tuple): (longer-side limit, shorter-side limit)

    Return:
        resized image

        resized bbox

        scale factor
    """
    if not isinstance(img, np.ndarray):
        img = np.array(img)

    # scale image values to 0-1
    img = img / 255.

    # scale implied by the longer-side limit
    scale1 = size[0] / max(img_size)

    # scale implied by the shorter-side limit
    scale2 = size[1] / min(img_size)

    # the final scale is the smaller of the two
    scale = scale2 if scale1 > scale2 else scale1

    # rescale spatial dims only; channel_axis keeps the 3 channels intact
    img = transform.rescale(img, scale=scale, anti_aliasing=True,
                            mode='reflect', channel_axis=-1)

    bbox = bbox * scale

    return img, bbox, scale

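# Worked example, using the 500x375 image from the annotation above:
# scale1 = 1000 / 500 = 2.0, scale2 = 600 / 375 = 1.6, so scale = 1.6
# and the image is resized to 800x600; the dog bbox [123, 115, 379, 275]
# becomes [196.8, 184.0, 606.4, 440.0].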

def random_flip(img, bbox, orientation='horizontal'):
    """
    Description:

    randomly flip the image horizontally or vertically

    Params:
        img(numpy.array): image

        bbox(numpy.array): bounding boxes

        orientation(string): optional param, `horizontal` or `vertical`

    Return:
        img(np.array): flipped image

        bbox(np.array): bboxes flipped to match
    """
    # flip probability, sampled from a 0-1 uniform distribution
    flip_prob = np.random.uniform(0, 1)

    # flip horizontally: mirror the x coordinates
    if orientation == 'horizontal':
        if flip_prob > 0.5:
            img = img[:, ::-1, :]
            bbox[:, [2, 0]] = img.shape[1] - bbox[:, [0, 2]]

    # flip vertically: mirror the y coordinates
    else:
        if flip_prob > 0.5:
            img = img[::-1, :, :]
            bbox[:, [3, 1]] = img.shape[0] - bbox[:, [1, 3]]

    return img, bbox

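# Worked example, continuing from resize above: after a horizontal flip of
# the 800-pixel-wide image, the dog bbox x range [196.8, 606.4] becomes
# [800 - 606.4, 800 - 196.8] = [193.6, 603.2]; y is unchanged.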

class Transform:

    def __init__(self, min_size=600, max_size=1000):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, img, bboxes):
        """
        image and bbox transform operations
        """
        # resize (PIL's img.size is (width, height))
        img, bboxes, scale = resize(img, bboxes, img.size,
                                    size=(self.max_size, self.min_size))

        # random flip
        img, bboxes = random_flip(img, bboxes)

        # caffe-style normalization (HWC, BGR, mean-subtracted)
        img = caffe_norm(img)

        # HWC -> CHW tensors
        return t.from_numpy(img).permute(2, 0, 1), t.from_numpy(bboxes), scale
```

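To eyeball a transformed sample, a small sketch (the matplotlib usage is an assumption, and dataset refers to the usage sketch earlier) can undo the caffe normalization with reverse_norm before display:

```python
import numpy as np
import matplotlib.pyplot as plt

img, bboxes, labels, scale = dataset[0]

# undo caffe_norm: CHW BGR float -> CHW RGB in the 0-255 range
img = reverse_norm(img.numpy(), caffe_pretrain=True)

# CHW -> HWC uint8 for plotting
img = img.transpose(1, 2, 0).clip(0, 255).astype(np.uint8)

plt.imshow(img)
plt.show()
```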