The COCO dataset is a large-scale benchmark for computer vision tasks, consisting of images of complex everyday scenes that contain common objects in their natural context. It contains a total of 2.5 million labeled instances in 328k images, featuring 91 object types that would be easily recognizable by a 4-year-old. Objects are labeled using per-instance segmentations to aid in precise object localization.
@inproceedings{lin2014microsoft,
  author    = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C. Lawrence},
  title     = {{Microsoft} {COCO}: Common Objects in Context},
  booktitle = {European Conference on Computer Vision},
  series    = {Lecture Notes in Computer Science},
  volume    = {8693},
  pages     = {740--755},
  publisher = {Springer},
  year      = {2014},
  doi       = {10.1007/978-3-319-10602-1_48},
}