IS = { zkontrolovano 25 Jan 2014 },
  UPDATE  = { 2014-01-06 },
    title = {Active-Speaker Detection and Localization with Microphones and Cameras Embedded into a Robotic Head},
    author = {Cech, Jan and Mittal, Ravi and Deleforge, Antoine and Sanchez-Riera, Jordi and Alameda-Pineda, Xavier and Horaud, Radu P.},
    booktitle = {Proc. Humanoids 2013: IEEE International Conference on Humanoid Robots},
    ISBN = {978-1-4799-2618-3},
    venue = {Atlanta, USA},
    publisher = {IEEE Robotics and Automation Society},
    address = {Piscataway, United States},
    year = {2013},
    month = {October},
    day = {15--17},
    pages = {203--210},
    book_pages = {554},
    Annote = {In this paper we present a method for detecting and
                  localizing an active speaker, i.e., a speaker that
                  emits a sound, through the fusion between visual
                  reconstruction with a stereoscopic camera pair and
                  sound-source localization with several
                  microphones. Both the cameras and the microphones
                  are embedded into the head of a humanoid robot. The
                  proposed statistical fusion model associates 3D
                  faces of potential speakers with 2D sound
                  directions. The paper has two contributions: (i) a
                  method that discretizes the two-dimensional space of
                  all possible sound directions and that accumulates
                  evidence for each direction by estimating the time
                  difference of arrival (TDOA) over all the microphone
                  pairs, such that all the microphones are used
                  simultaneously and symmetrically and (ii) an
                  audio-visual alignment method that maps 3D visual
                  features onto 2D sound directions and onto TDOAs
                  between microphone pairs. This allows to implicitly
                  represent both sensing modalities into a common
                  audiovisual coordinate frame. Using simulated as
                  well as real data, we quantitatively assess the
                  robustness of the method against noise and
                  reverberations, and we compare it with several other
                  methods. Finally, we describe a real-time
                  implementation using the proposed technique and with
                  a humanoid head embedding four microphones and two
                  cameras: this enables natural human-robot
                  interactive behavior.},
    keywords = {audio-visual fusion, multi-modal perception, directional hearing, stereo-vision},
    project = {FP7-ICT-247525 HUMAVIPS, GACR P103/12/G084},
    pdf = {http://hal.inria.fr/hal-00861465/PDF/main\_final.pdf},
    url = {http://hal.inria.fr/hal-00861465},