@TechReport{Torii-et-al-TR-2008-25,
  IS = { zkontrolovano 25 Feb 2010 },
  UPDATE  = { 2009-03-23 },
  author =      {Torii, Akihiko and Havlena, Michal and Jan{\v c}o{\v s}ek, Michal and 
                 K{\'u}kelov{\'a}, Zuzana and Pajdla, Tom{\' a}{\v s}},
  title =       {Dynamic {3D} Scene Analysis from Omni-Directional Video Data},
  institution = {Center for Machine Perception, K13133 FEE
                 Czech Technical University},
  address =     {Prague, Czech Republic},
  year =        {2008},
  month =       {December},
  type =        {Research Report},
  number =      {CTU--CMP--2008--25},
  issn =        {1213-2365},
  pages =       {54},
  figures =     {28},
  authorship =  {20-20-20-20-20},
  psurl       = {[Torii-et-al-TR-2008-25.pdf]},
  project =     {FP6-IST-027787, FP7-SPACE-218814 PRoVisG, MSM6840770038},
  annote = {In this report we present several contributions that
    support image and video analysis for dynamic 3D scene analysis
    from omnidirectional video data.  First, we describe a pipeline
    for camera pose and trajectory estimation, and image stabilization
    and rectification for dense as well as wide baseline
    omnidirectional images. The input is a set of images taken by a single
    hand-held camera. The output is a set of stabilized and rectified
    images augmented by the computed camera 3D trajectory and
    reconstruction of feature points facilitating visual object
    recognition. We generalize previous works on camera trajectory
    estimation done on perspective images to omnidirectional images
    and introduce a new technique for omnidirectional image
    rectification that is suited for recognizing people and cars in
    images. The performance of the pipeline is demonstrated on a real
    image sequence acquired in urban as well as natural
    environments. The approach has been used to eliminate unwanted
    rotations of a mobile camera; images were stabilized to allow for
    using the ground plane constraint in pedestrian recognition.
    Then, a new efficient technique for large-scale structure from
    motion from unordered data sets is proposed. In this technique we
    avoid costly computation of all pairwise matches and geometries
    by sampling pairs of images using the pairwise similarity scores
    based on the detected occurrences of visual words leading to a
    significant speedup. Furthermore, atomic 3D models reconstructed
    from camera triplets are used as the seeds which form the final
    large-scale 3D model when merged together. Using three views
    instead of two allows us to reveal most of the outliers of
    pairwise geometries at an early stage of the process hindering
    them from derogating the quality of the resulting 3D structure at
    later stages. The accuracy of the proposed technique is shown on a
    set of 64 images where the result of the exhaustive technique is
    known. Scalability is demonstrated on a landmark reconstruction
    from hundreds of images. The technique has been successfully
    tested on images acquired by a mobile AWEAR platform. It has been
    observed that so called ``loop closing'' appeared spontaneously at
    many places.  Third, a scalable multi-view stereo reconstruction
    method which can deal with a large number of large images in
    affordable time and effort is presented in more detail. The
    computational effort of our technique is a function of the surface
    area of the observed scene which is conveniently discretized to
    represent sufficient but not excessive detail. Our technique works
    as a filter on a limited number of images at a time and can thus
    process arbitrarily large data sets using limited memory. By
    building reconstructions gradually, we avoid unnecessary processing
    of data which bring little improvement. In experiments with
    Middlebury and Strecha's databases, we demonstrate that we
    achieve results comparable to the state of the art with
    considerably smaller effort and time than used by previous methods. We
    present a large scale experiment in which we processed 1000
    images from the Google Street View Pittsburgh Experimental Data
    Set. Large scale dense stereo reconstruction from many images has
    been previously found vital for detecting inconsistencies in
    camera positions and camera calibrations that accumulate in large
    data (especially sequences) over time. The method has been used
    to investigate the accuracy of the camera tracking for analysis of
    dynamic 3D scenes using the AWEAR platform.  Finally, we
    investigated a possibility to deal with two cameras like with a
    single generalized non-central camera. The first experiments with
    Stewenius and Byroed et al. algorithms for estimating relative
    camera pose from image matches have been tried. We can conclude
    that this is a promising approach to camera rig tracking which
    needs to be investigated further.},
  keywords =    {Structure from Motion, Omnidirectional Vision},
}