- Data introduction
The Pascal VOC dataset is one of the most commonly used datasets for object detection and segmentation, and it contains 20 object categories. The directory structure of the VOC dataset is as follows.
Annotations: stores the annotation information for each image, saved as XML files.
ImageSets:
a. The Action subfolder stores the image lists for the person action task.
b. The Layout subfolder stores the image lists for the person layout (body part) task.
c. The Main subfolder stores the image lists for classification and detection.
d. The Segmentation subfolder stores the image lists for segmentation.
JPEGImages: stores the image files in JPG format.
SegmentationClass: stores the segmentation masks labeled by class.
SegmentationObject: stores the segmentation masks labeled by object instance.
For the object detection task, only ImageSets/Main, Annotations, and JPEGImages are needed, as the quick check below shows.
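As a quick sanity check before training, the split files under ImageSets/Main can be inspected directly. The sketch below uses a hypothetical `VOC_ROOT` path and simply counts how many image ids each detection split contains.

```python
import os

# hypothetical path to an extracted VOC release; adjust to your own setup
VOC_ROOT = '/path/to/VOCdevkit/VOC2012/'

# each split file under ImageSets/Main holds one image id per line
for split in ('train', 'val', 'trainval'):
    split_file = os.path.join(VOC_ROOT, 'ImageSets', 'Main', '%s.txt' % split)
    with open(split_file) as f:
        ids = [line.strip() for line in f if line.strip()]
    print(split, len(ids))
```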
The format of an annotation file is as follows:
```xml
<annotation>
    <folder>VOC2012</folder>
    <!-- corresponding image file name -->
    <filename>2007_000063.jpg</filename>
    <source>
        <database>The VOC2007 Database</database>
        <annotation>PASCAL VOC2007</annotation>
        <image>flickr</image>
    </source>
    <size>
        <!-- image width -->
        <width>500</width>
        <!-- image height -->
        <height>375</height>
        <!-- image depth (number of channels) -->
        <depth>3</depth>
    </size>
    <segmented>1</segmented>
    <object>
        <!-- object class -->
        <name>dog</name>
        <!-- shooting angle / pose -->
        <pose>Unspecified</pose>
        <!-- whether the object is truncated; 0 means not truncated -->
        <truncated>0</truncated>
        <!-- detection difficulty; 0 means easy -->
        <difficult>0</difficult>
        <!-- object bounding box coordinates (x_min, y_min, x_max, y_max) -->
        <bndbox>
            <xmin>123</xmin>
            <ymin>115</ymin>
            <xmax>379</xmax>
            <ymax>275</ymax>
        </bndbox>
    </object>
    <object>
        <name>chair</name>
        <pose>Frontal</pose>
        <truncated>1</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>75</xmin>
            <ymin>1</ymin>
            <xmax>428</xmax>
            <ymax>375</ymax>
        </bndbox>
    </object>
</annotation>
```
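The sketch below shows one way to read such a file with Python's built-in xml.etree.ElementTree; the file name used here is just the example above and should be replaced with a real annotation path.

```python
from xml.etree import ElementTree as et

# hypothetical path; point this at any file under Annotations/
xml_path = 'Annotations/2007_000063.xml'

root = et.parse(xml_path).getroot()
size = root.find('size')
print('image size: %s x %s' % (size.find('width').text, size.find('height').text))

# print the class name and box coordinates of every annotated object
for obj in root.findall('object'):
    name = obj.find('name').text
    bndbox = obj.find('bndbox')
    box = [int(bndbox.find(tag).text) for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
    print(name, box)
```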
- Dataset creation
The code for building the dataset is as follows:
```python
import os

import numpy as np
import torch as t
from PIL import Image
from torch.utils.data import Dataset
from xml.etree import ElementTree as et

from data.data_aug import *


class VocDataset(Dataset):
    """Pascal VOC dataset for object detection."""

    def __init__(self, data_dir, split='trainval'):
        super(VocDataset, self).__init__()
        # path of the split file that lists the image ids, e.g. ImageSets/Main/trainval.txt
        split_path = data_dir + 'ImageSets/Main/%s.txt' % split
        # read the image ids
        with open(split_path) as f:
            ids = f.readlines()
        # keep only the images and annotation files that actually exist on disk
        self.imgs = [data_dir + 'JPEGImages/%s.jpg' % img.strip() for img in ids
                     if os.path.exists(data_dir + 'JPEGImages/%s.jpg' % img.strip())]
        self.label = [data_dir + 'Annotations/%s.xml' % img.strip() for img in ids
                      if os.path.exists(data_dir + 'Annotations/%s.xml' % img.strip())]
        assert len(self.imgs) == len(self.label), 'image count must equal label count'

    def __len__(self):
        return len(self.imgs)

    def parse_xml(self, xml_file):
        """Parse one annotation file and return its bounding boxes and class indices."""
        label_tree = et.fromstring(open(xml_file).read())
        bboxes = []
        label = []
        objects = label_tree.findall('object')
        for obj in objects:
            bbox = obj.find('bndbox')
            label.append(VOC_BBOX_LABEL_NAMES.index(obj.find('name').text))
            # box format: [xmin, ymin, xmax, ymax]
            box = []
            tags = ['xmin', 'ymin', 'xmax', 'ymax']
            for tag in tags:
                box.append(int(bbox.find(tag).text))
            bboxes.append(box)
        return bboxes, label

    def __getitem__(self, idx):
        bboxes, labels = self.parse_xml(self.label[idx])
        bboxes = np.array(bboxes)
        labels = np.array(labels).astype(np.int32)
        img = Image.open(self.imgs[idx])
        trans = Transform()
        img, bboxes, scale = trans(img, bboxes)
        return img, bboxes, t.from_numpy(labels), scale
```
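parse_xml relies on VOC_BBOX_LABEL_NAMES, which is assumed to come in through the star import from data.data_aug. It is not shown in the original code; a plausible definition is simply the tuple of the 20 VOC category names, for example:

```python
# assumed lookup table mapping class names to indices via .index();
# these are the 20 Pascal VOC categories, though the actual project may order them differently
VOC_BBOX_LABEL_NAMES = (
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor')
```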
The data augmentation code is as follows:
```python
import numpy as np
np.random.seed(0)
from skimage import transform
import torch as t


def reverse_norm(img, caffe_pretrain):
    """
    Reverse a normalized image back to the 0-255 range (for visualization).
    """
    if caffe_pretrain:
        img = img + (np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1))
        return img[::-1, :, :]
    # approximate un-normalization for visualization
    return (img * 0.225 + 0.45).clip(min=0, max=1) * 255


def caffe_norm(img):
    # caffe channel order is BGR, not RGB
    img = img[:, :, [2, 1, 0]]
    # scale back to 0-255 before subtracting the per-channel BGR mean
    img = img * 255.
    img = img - np.array([122.7717, 115.9465, 102.9801]).reshape((1, 1, 3))
    return img.astype(np.float32)


def standard_norm(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
    """
    Note: these mean and std values are the ImageNet statistics commonly used with
    torchvision pretrained models; normalizing with such statistics usually speeds up convergence.
    """
    mean, std = np.array(mean).reshape((1, 1, 3)), np.array(std).reshape((1, 1, 3))
    img = (img - mean) / std
    return img.astype(np.float32)


def resize(img, bbox, img_size, size=(1000, 600)):
    """
    Description:
        Resize the image so that its longest side is no more than 1000 pixels and
        its shortest side is no more than 600 pixels.
        There is no need to resize every image to the same size, because the batch size is limited to 1.
    Params:
        img(numpy.array): image before resizing
        bbox(numpy.array): bounding boxes
        img_size(tuple): (image height, image width)
    Return:
        resized image, resized bboxes, scale factor
    """
    if not isinstance(img, np.ndarray):
        img = np.array(img)
    # scale pixel values to 0-1
    img = img / 255.
    # scale determined by the longest side
    scale1 = 1000 / max(img_size)
    # scale determined by the shortest side
    scale2 = 600 / min(img_size)
    # the final scale is the smaller of the two
    scale = scale2 if scale1 > scale2 else scale1
    # channel_axis keeps the color channels untouched (use multichannel=True on skimage < 0.19)
    img = transform.rescale(img, scale=scale, anti_aliasing=True, mode='reflect', channel_axis=-1)
    bbox = bbox * scale
    return img, bbox, scale


def random_flip(img, bbox, orientation='horizontal'):
    """
    Description:
        Randomly flip the image horizontally or vertically.
    Params:
        img(numpy.array): image
        bbox(numpy.array): bounding boxes
        orientation(string): optional, `horizontal` or `vertical`
    Return:
        img(np.array): flipped image
        bbox(np.array): flipped bounding boxes
    """
    # flip probability, sampled from a uniform distribution on [0, 1]
    flip_prob = np.random.uniform(0, 1)
    # flip horizontally
    if orientation == 'horizontal':
        if flip_prob > 0.5:
            img = img[:, ::-1, :]
            bbox[:, [2, 0]] = img.shape[1] - bbox[:, [0, 2]]
    # flip vertically
    else:
        if flip_prob > 0.5:
            img = img[::-1, :, :]
            bbox[:, [3, 1]] = img.shape[0] - bbox[:, [1, 3]]
    return img, bbox


class Transform:
    def __init__(self, min_size=600, max_size=1000):
        # note: resize() above hard-codes the 600/1000 limits, so these are kept only for reference
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, img, bboxes):
        """
        Image and bbox transform pipeline: resize, random flip, then caffe-style normalization.
        """
        # resize
        img, bboxes, scale = resize(img, bboxes, img.size)
        # random flip
        img, bboxes = random_flip(img, bboxes)
        img = caffe_norm(img)
        return t.from_numpy(img).permute(2, 0, 1), t.from_numpy(bboxes), scale
```
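To tie the two files together, here is a minimal usage sketch with a hypothetical VOC root path; the DataLoader keeps batch_size at 1 because every image keeps its own rescaled size.

```python
from torch.utils.data import DataLoader

# hypothetical path to an extracted VOC release; adjust to your own setup
voc_root = '/path/to/VOCdevkit/VOC2012/'

dataset = VocDataset(voc_root, split='trainval')
# batch_size must stay at 1 because images are not resized to a common shape
loader = DataLoader(dataset, batch_size=1, shuffle=True)

img, bboxes, labels, scale = next(iter(loader))
print(img.shape, bboxes.shape, labels.shape, scale)
```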