The Grounded Language Dataset (GoLD) is a multimodal dataset of common household objects described by people using either spoken or written language. GoLD comprises RGB and depth point cloud images of 47 classes of objects in five high-level categories. It includes 8250 text and 4059 speech descriptions gathered with Amazon Mechanical Turk (AMT).
@inproceedings{kebe2021a,
  author    = {Gaoussou Youssouf Kebe and Padraig Higgins and Patrick Jenkins and Kasra Darvish and Rishabh Sachdeva and Ryan Barron and John Winder and Donald Engel and Edward Raff and Francis Ferraro and Cynthia Matuszek},
  title     = {A Spoken Language Dataset of Descriptions for Speech-Based Grounded Language Learning},
  booktitle = {Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)},
  year      = {2021},
  url       = {https://openreview.net/forum?id=Yx9jT3fkBaD},
}