The Grounded Language Dataset (GoLD) is a multimodal dataset of RGB+depth images of common household objects, along with English natural language descriptions in multiple formats: text, speech (audio), and speech transcriptions. GoLD comprises RGB and depth point cloud images of 47 classes of objects in five high-level categories. It includes 16,500 text and 16,500 speech descriptions gathered with Amazon Mechanical Turk (AMT).
The dataset is intended for research at the intersection of robotics, NLP, and HCI. It may help researchers investigate how the multiple modalities of image, depth, text, speech, and transcription interact during learning, as well as how differences in the vernacular of these modalities impact results.
@inproceedings{kebe2021a,
  author    = {Kebe, Gaoussou Youssouf and Higgins, Padraig and Jenkins, Patrick and Darvish, Kasra and Sachdeva, Rishabh and Barron, Ryan and Winder, John and Engel, Donald and Raff, Edward and Ferraro, Francis and Matuszek, Cynthia},
  title     = {A Spoken Language Dataset of Descriptions for Speech-Based Grounded Language Learning},
  booktitle = {Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)},
  year      = {2021},
  url       = {https://openreview.net/forum?id=Yx9jT3fkBaD},
}