% Dubba et al. (2014): symposium paper on grounding language in perception
% for scene conceptualization in autonomous robots.
% Names are stored in "von Last, First" form so that parsing is unambiguous.
% NOTE(review): "Seabra Lopes" may be a compound surname and "Tome" may carry
% an accent in the published paper -- confirm against the original and adjust.
@inproceedings{dubba_aaai14,
  author    = {Dubba, Krishna S. R. and
               de Oliveira, Miguel R. and
               Lim, Gi Hyun and
               Kasaei, Hamidreza and
               Lopes, Luis Seabra and
               Tome, Ana and
               Cohn, Anthony G.},
  title     = {Grounding Language in Perception for Scene Conceptualization in Autonomous Robots},
  booktitle = {{AAAI} Spring Symposium on Qualitative Representations for Robots},
  publisher = {{AAAI} Press},
  year      = {2014},
  abstract  = {In order to behave autonomously, it is desirable for
    robots to have the ability to use human supervision and
    learn from different input sources (perception, gestures,
    verbal and textual descriptions etc). In many machine
    learning tasks, the supervision is directed specifically
    towards machines and hence is straight forward clearly
    annotated examples. But this is not always very practical
    and recently it was found that the most preferred
    interface to robots is natural language. Also the supervision
    might only be available in a rather indirect
    form, which may be vague and incomplete. This is frequently
    the case when humans teach other humans since
    they may assume a particular context and existing world
    knowledge. We explore this idea here in the setting of
    conceptualizing objects and scene layouts. Initially the
    robot undergoes training from a human in recognizing
    some objects in the world and armed with this acquired
    knowledge it sets out in the world to explore and learn
    more higher level concepts like static scene layouts and
    environment activities. Here it has to exploit its learned
    knowledge and ground language into perception to use
    inputs from different sources that might have overlapping
    as well as novel information. When exploring, we
    assume that the robot is given visual input, without explicit
    type labels for objects, and also that it has access
    to more or less generic linguistic descriptions of scene layout.
    Thus our task here is to learn the spatial structure of
    a scene layout and simultaneously visual object
    models it was not trained on. In this paper, we present a
    cognitive architecture and learning framework for robot
    learning through natural human supervision and using
    multiple input sources by grounding language in perception.},
}