conf.bib
@COMMENT{{{This file has been generated by bib2bib 1.74}}
@COMMENT{{{Command line: bib2bib-1.74.exe -ob conf.bib -c $type='INPROCEEDINGS' -s year -r mediamill.bib}}
% ECCV 2010 workshop paper. No page range, PDF link, or abstract is recorded for this entry.
@inproceedings{SandeCVGPU10,
  author    = {van de Sande, Koen E. A. and Gevers, Theo and Snoek, Cees G. M.},
  title     = {Accelerating Visual Categorization with the {GPU}},
  booktitle = {{ECCV} Workshop on Computer Vision on {GPU}},
  address   = {Crete, Greece},
  month     = {September},
  year      = {2010}
}
% CIVR 2010 paper on content-based video retrieval in the audiovisual archive.
% The surname of the third author is braced so that it is kept as one unit.
@inproceedings{HuurninkCIVR10,
  author    = {Huurnink, Bouke and Snoek, Cees G. M. and {de Rijke}, Maarten and Smeulders, Arnold W. M.},
  title     = {Today's and Tomorrow's Retrieval Practice in the Audiovisual Archive},
  booktitle = {Proceedings of the {ACM} International Conference on Image and Video Retrieval},
  pages     = {18--25},
  address   = {Xi'an, China},
  month     = {July},
  year      = {2010},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/huurnink-archive-civr2010.pdf},
  abstract  = {
Content-based video retrieval is maturing to the point where
it can be used in real-world retrieval practices. One such
practice is the audiovisual archive, whose users increasingly
require fine-grained access to broadcast television content.
We investigate to what extent content-based video retrieval
methods can improve search in the audiovisual archive. In
particular, we propose an evaluation methodology tailored to
the specific needs and circumstances of the audiovisual archive,
which are typically missed by existing evaluation initiatives.
We utilize logged searches and content purchases from an
existing audiovisual archive to create realistic query sets
and relevance judgments. To reflect the retrieval practice of
both the archive and the video retrieval community as closely
as possible, our experiments with three video search engines
incorporate archive-created catalog entries as well as
state-of-the-art multimedia content analysis results. We find
that incorporating content-based video retrieval into the
archive's practice results in significant performance increases
for shot retrieval and for retrieving entire television programs.
Our experiments also indicate that individual content-based
retrieval methods yield approximately equal performance gains.
We conclude that the time has come for audiovisual archives to
start accommodating content-based video retrieval methods into
their daily practice.
}
}
% ACM Multimedia 2009 paper on social-tagged negative examples. No page range is recorded.
@inproceedings{LiACM09,
  author    = {Li, Xirong and Snoek, Cees G. M.},
  title     = {Visual Categorization with Negative Examples for Free},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  address   = {Beijing, China},
  month     = {October},
  year      = {2009},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/li-negative-for-free-acm2009.pdf},
  abstract  = {
Automatic visual categorization is critically dependent on labeled examples
for supervised learning. As an alternative to traditional expert labeling,
social-tagged multimedia is becoming a novel yet subjective and inaccurate
source of learning examples. Different from existing work focusing on collecting
positive examples, we study in this paper the potential of substituting social
tagging for expert labeling for creating negative examples. We present an
empirical study using 6.5 million Flickr photos as a source of social tagging.
Our experiments on the PASCAL VOC challenge 2008 show that with a relative loss
of only 4.3\% in terms of mean average precision, expert-labeled negative
examples can be completely replaced by social-tagged negative examples for
consumer photo categorization.
}
}
% ICME 2009 invited paper on social tagged images for video search. No venue address is recorded.
@inproceedings{SetzICME09,
  author    = {Setz, Arjan T. and Snoek, Cees G. M.},
  title     = {Can Social Tagged Images Aid Concept-Based Video Search?},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  pages     = {1460--1463},
  month     = {June--July},
  year      = {2009},
  note      = {Invited paper.},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/setz-social-tags-icme2009.pdf},
  abstract  = {
This paper seeks to unravel whether commonly available social tagged
images can be exploited as a training resource for concept-based
video search. Since social tags are known to be ambiguous, overly
personalized, and often error prone, we place special emphasis on
the role of disambiguation. We present a systematic experimental
study that evaluates concept detectors based on social tagged
images, and their disambiguated versions, in three application
scenarios: within-domain, cross-domain, and together with an
interacting user. The results indicate that social tagged images can
aid concept-based video search indeed, especially after
disambiguation and when used in an interactive video retrieval
setting. These results open-up interesting avenues for future
research.
}
}
% ICASSP 2009 invited paper on image annotation with user-tagged photos. No page range is recorded.
@inproceedings{LiICASSP09,
  author    = {Li, Xirong and Snoek, Cees G. M. and Worring, Marcel},
  title     = {Annotating Images by Harnessing Worldwide User-Tagged Photos},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  address   = {Taipei, Taiwan},
  month     = {April},
  year      = {2009},
  note      = {Invited paper.},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/li-annotating-images-icassp2009.pdf},
  abstract  = {
Automatic image tagging is important yet challenging due to the
semantic gap and the lack of learning examples to model a tag's
visual diversity. Meanwhile, social user tagging is creating rich
multimedia content on the web. In this paper, we propose to combine
the two tagging approaches in a search-based framework. For an
unlabeled image, we first retrieve its visual neighbors from a large
user-tagged image database. We then select relevant tags from the
result images to annotate the unlabeled image. To tackle the
unreliability and sparsity of user tagging, we introduce a
joint-modality tag relevance estimation method which efficiently
addresses both textual and visual clues. Experiments on 1.5 million
Flickr photos and 10 000 Corel images verify the proposed method.
}
}
% SAMT 2008 paper published in the Springer LNCS series. No editors are recorded.
% NOTE(review): the address appears to be the publisher's city; the booktitle places the conference in Koblenz.
@inproceedings{ByrneSAMT08,
  author    = {Byrne, Daragh and Doherty, Aiden R. and Snoek, Cees G. M. and Jones, Gareth J. F. and Smeaton, Alan F.},
  title     = {Validating the Detection of Everyday Concepts in Visual Lifelogs},
  booktitle = {Proceedings of the International Conference on Semantic and Digital Media Technologies, SAMT 2008, Koblenz, Germany, December 3--5, 2008},
  series    = {LNCS},
  publisher = {Springer-Verlag},
  pages     = {15--30},
  address   = {Berlin, Germany},
  month     = {December},
  year      = {2008},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/byrne-everyday-concepts-samt2008.pdf},
  abstract  = {
The Microsoft SenseCam is a small lightweight wearable camera used
to passively capture photos and other sensor readings from a user's
day-to-day activities. It can capture up to 3,000 images per day,
equating to almost 1 million images per year. It is used to aid memory
by creating a personal multimedia lifelog, or visual recording of
the wearer's life. However the sheer volume of image data captured
within a visual lifelog creates a number of challenges, particularly
for locating relevant content. Within this work, we explore the
applicability of semantic concept detection, a method often used
within video retrieval, on the novel domain of visual lifelogs. A
concept detector models the correspondence between low-level visual
features and high-level semantic concepts (such as indoors, outdoors,
people, buildings, etc.) using supervised machine learning. By doing
so it determines the probability of a concept's presence. We apply
detection of 27 everyday semantic concepts on a lifelog collection
composed of 257,518 SenseCam images from 5 users. The results were
then evaluated on a subset of 95,907 images, to determine the
precision for detection of each semantic concept and to draw some
interesting inferences on the lifestyles of those 5 users. We
additionally present future applications of concept detection within
the domain of lifelogging.
}
}
% ACM MIR 2008 paper on learning tag relevance by neighbor voting.
@inproceedings{LiMIR08,
  author    = {Li, Xirong and Snoek, Cees G. M. and Worring, Marcel},
  title     = {Learning Tag Relevance by Neighbor Voting for Social Image Retrieval},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia Information Retrieval},
  pages     = {180--187},
  address   = {Vancouver, Canada},
  month     = {October},
  year      = {2008},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/li-tag-relevance-mir2008.pdf},
  abstract  = {
Social image retrieval is important for exploiting the increasing
amounts of amateur-tagged multimedia such as Flickr images. Since
amateur tagging is known to be uncontrolled, ambiguous, and
personalized, a fundamental problem is how to reliably interpret
the relevance of a tag with respect to the visual content it is
describing. Intuitively, if different persons label similar images
using the same tags, these tags are likely to reflect objective
aspects of the visual content. Starting from this intuition, we
propose a novel algorithm that scalably and reliably learns tag
relevance by accumulating votes from visually similar neighbors.
Further, treated as tag frequency, learned tag relevance is
seamlessly embedded into current tag-based social image retrieval
paradigms. Preliminary experiments on one million Flickr images
demonstrate the potential of the proposed algorithm. Overall
comparisons for both single-word queries and multiple-word queries
show substantial improvement over the baseline by learning and using
tag relevance. Specifically, compared with the baseline using the
original tags, on average, retrieval using improved tags increases
mean average precision by 24\%, from 0.54 to 0.67. Moreover,
simulated experiments indicate that performance can be improved
further by scaling up the amount of images used in the proposed
neighbor voting algorithm.
}
}
% CIVR 2008 paper introducing thread-based video browsing with the ForkBrowser.
@inproceedings{RooijCIVR08,
  author    = {de Rooij, Ork and Snoek, Cees G. M. and Worring, Marcel},
  title     = {Balancing Thread Based Navigation for Targeted Video Search},
  booktitle = {Proceedings of the {ACM} International Conference on Image and Video Retrieval},
  pages     = {485--494},
  address   = {Niagara Falls, Canada},
  month     = {July},
  year      = {2008},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/rooij-thread-based-navigation-civr2008.pdf},
  abstract  = {
Various query methods for video search exist. Because of the
semantic gap each method has its limitations. We argue that
for effective retrieval query methods need to be combined at
retrieval time. However, switching query methods often involves
a change in query and browsing interface, which puts
a heavy burden on the user. In this paper, we propose a
novel method for fast and effective search through large video
collections by embedding multiple query methods into a single
browsing environment. To that end we introduced the
notion of query threads, which contain a shot-based ranking
of the video collection according to some feature-based
similarity measure. On top of these threads we define several
thread-based visualizations, ranging from fast targeted
search to very broad exploratory search, with the ForkBrowser
as the balance between fast search and video space
exploration. We compare the effectiveness and efficiency of
the ForkBrowser with the CrossBrowser on the TRECVID
2007 interactive search task. Results show that different
query methods are needed for different types of search topics,
and that the ForkBrowser requires significantly less user
interactions to achieve the same result as the CrossBrowser.
In addition, both browsers rank among the best interactive
retrieval systems currently available.
}
}
% CIVR 2008 paper comparing color features for visual concept classification.
@inproceedings{SandeCIVR08,
  author    = {van de Sande, Koen E. A. and Gevers, Theo and Snoek, Cees G. M.},
  title     = {A Comparison of Color Features for Visual Concept Classification},
  booktitle = {Proceedings of the {ACM} International Conference on Image and Video Retrieval},
  pages     = {141--149},
  address   = {Niagara Falls, Canada},
  month     = {July},
  year      = {2008},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/sande-colorfeatures-civr2008.pdf},
  abstract  = {
Concept classification is important to access visual information on
the level of objects and scene types. So far, intensity-based features
have been widely used. To increase discriminative power, color
features have been proposed only recently. As many features exist,
a structured overview is required of color features in the context of
concept classification.
Therefore, this paper studies 1. the invariance properties and
2. the distinctiveness of color features in a structured way. The
invariance properties of color features with respect to photometric
changes are summarized. The distinctiveness of color features is
assessed experimentally using an image and a video benchmark:
the PASCAL VOC Challenge 2007 and the Mediamill Challenge.
Because color features cannot be studied independently from the
points at which they are extracted, different point sampling strategies
based on Harris-Laplace salient points, dense sampling and the
spatial pyramid are also studied.
From the experimental results, it can be derived that invariance
to light intensity changes and light color changes affects concept
classification. The results reveal further that the usefulness of
invariance is concept-specific.
}
}
% CVPR 2008 paper evaluating color descriptors for object and scene recognition. No page range is recorded.
@inproceedings{SandeCVPR08,
  author    = {van de Sande, Koen E. A. and Gevers, Theo and Snoek, Cees G. M.},
  title     = {Evaluation of Color Descriptors for Object and Scene Recognition},
  booktitle = {Proceedings of the {IEEE} Computer Society Conference on Computer Vision and Pattern Recognition},
  address   = {Anchorage, Alaska},
  month     = {June},
  year      = {2008},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/sande-colordescriptors-cvpr2008.pdf},
  abstract  = {
Image category recognition is important to access visual
information on the level of objects and scene types. So far,
intensity-based descriptors have been widely used. To increase
illumination invariance and discriminative power,
color descriptors have been proposed only recently. As
many descriptors exist, a structured overview of color invariant
descriptors in the context of image category recognition
is required.
Therefore, this paper studies the invariance properties
and the distinctiveness of color descriptors in a structured
way. The invariance properties of color descriptors are
shown analytically using a taxonomy based on invariance
properties with respect to photometric transformations. The
distinctiveness of color descriptors is assessed experimentally
using two benchmarks from the image domain and the
video domain.
From the theoretical and experimental results, it can be
derived that invariance to light intensity changes and light
color changes affects category recognition. The results reveal
further that, for light intensity changes, the usefulness
of invariance is category-specific.
}
}
% CGIV 2008 paper on color descriptors for object category recognition. No page range is recorded.
@inproceedings{SandeCGIV08,
  author    = {van de Sande, Koen E. A. and Gevers, Theo and Snoek, Cees G. M.},
  title     = {Color Descriptors for Object Category Recognition},
  booktitle = {Proceedings of the {IS\&T} European Conference on Colour in Graphics, Imaging, and Vision},
  address   = {Terrassa-Barcelona, Spain},
  month     = {June},
  year      = {2008},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/sande-color-descriptors-cgiv2008.pdf},
  abstract  = {
Category recognition is important to access visual information
on the level of objects. A common approach is to compute
image descriptors first and then to apply machine learning to
achieve category recognition from annotated examples. As a
consequence, the choice of image descriptors is of great influence
on the recognition accuracy. So far, intensity-based (e.g. SIFT)
descriptors computed at salient points have been used. However,
color has been largely ignored. The question is, can color
information improve accuracy of category recognition?
Therefore, in this paper, we will extend both salient point
detection and region description with color information. The
extension of color descriptors is integrated into the framework
of category recognition enabling to select both intensity and
color variants. Our experiments on an image benchmark show that
category recognition benefits from the use of color. Moreover,
the combination of intensity and color descriptors yields a 30\%
improvement over intensity features alone.
}
}
% ACM Multimedia 2007 paper on thread-based browsing of news video.
@inproceedings{RooijACM07,
  author    = {de Rooij, Ork and Snoek, Cees G. M. and Worring, Marcel},
  title     = {Query on Demand Video Browsing},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  pages     = {811--814},
  address   = {Augsburg, Germany},
  month     = {September},
  year      = {2007},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/rooij-rotor-acm2007.pdf},
  abstract  = {
This paper describes a novel method for browsing a large
collection of news video by linking various forms of related
video fragments together as threads. Each thread contains
a sequence of shots with high feature-based similarity. Two
interfaces are designed which use threads as the basis for
browsing. One interface shows a minimal set of threads,
and the other as many as possible. Both interfaces are
evaluated in the TRECVID interactive retrieval task, where
they ranked among the best interactive retrieval systems
currently available. The results indicate that the use of
threads in interactive video search is very beneficial. We
have found that in general the query result and the timeline
are the most important threads. However, having several
additional threads allow a user to find unique results which
cannot easily be found by using query results and time alone.
}
}
% ICIAP 2007 paper describing the MediaMill system in TRECVID 2006. No page range is recorded.
@inproceedings{SmeuldersICIAP07,
  author    = {Smeulders, Arnold W. M. and van Gemert, Jan C. and Huurnink, Bouke and Koelma, Dennis C. and de Rooij, Ork and van de Sande, Koen E. A. and Snoek, Cees G. M. and Veenman, Cor J. and Worring, Marcel},
  title     = {Semantic Video Search},
  booktitle = {International Conference on Image Analysis and Processing},
  address   = {Modena, Italy},
  month     = {September},
  year      = {2007},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/smeulders-search-iciap2007.pdf},
  abstract  = {
In this paper we describe the current performance of our MediaMill
system as presented in the TRECVID 2006 benchmark for video search
engines. The MediaMill team participated in two tasks: concept
detection and search. For concept detection we use the MediaMill
Challenge as experimental platform. The MediaMill Challenge divides
the generic video indexing problem into a visual-only, textual-only,
early fusion, late fusion, and combined analysis experiment. We
provide a baseline implementation for each experiment together with
baseline results. We extract image features, on global, regional,
and keypoint level, which we combine with various supervised
learners. A late fusion approach of visual-only analysis methods
using geometric mean was our most successful run. With this run we
conquer the Challenge baseline by more than 50\%. Our concept
detection experiments have resulted in the best score for three
concepts: i.e. \emph{desert}, \emph{flag us}, and \emph{charts}.
What is more, using LSCOM annotations, our visual-only approach
generalizes well to a set of 491 concept detectors. To handle such a
large thesaurus in retrieval, an engine is developed which allows
users to select relevant concept detectors based on interactive
browsing using advanced visualizations. Similar to previous years
our best interactive search runs yield top performance, ranking 2nd
and 6th overall.
}
}
% ICME 2007 paper on visual content and style for concert video indexing.
@inproceedings{SnoekICME07b,
  author    = {Snoek, Cees G. M. and Worring, Marcel and Smeulders, Arnold W. M. and Freiburg, Bauke},
  title     = {The Role of Visual Content and Style for Concert Video Indexing},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  pages     = {252--255},
  address   = {Beijing, China},
  month     = {July},
  year      = {2007},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-fabchannel-icme2007.pdf},
  abstract  = {
This paper contributes to the automatic indexing of concert video.
In contrast to traditional methods, which rely primarily on audio
information for summarization applications, we explore how a
visual-only concept detection approach could be employed. We
investigate how our recent method for news video indexing -- which
takes into account the role of content and style -- generalizes to
the concert domain. We analyze concert video on three levels of
visual abstraction, namely: content, style, and their fusion.
Experiments with 12 concept detectors, on 45 hours of visually
challenging concert video, show that the automatically learned best
approach is concept-dependent. Moreover, these results suggest that
the visual modality provides ample opportunity for more effective
indexing and retrieval of concert video when used in addition to the
auditory modality.
}
}
% ICME 2007 paper on the effectiveness of concept detector lexicons for video search.
@inproceedings{SnoekICME07a,
  author    = {Snoek, Cees G. M. and Worring, Marcel},
  title     = {Are Concept Detector Lexicons Effective for Video Search?},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  pages     = {1966--1969},
  address   = {Beijing, China},
  month     = {July},
  year      = {2007},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-concept-icme2007.pdf},
  abstract  = {
Until now, systematic studies on the effectiveness of concept
detectors for video search have been carried out using less than 20
detectors, or in combination with other retrieval techniques. We
investigate whether video search using just large concept detector
lexicons is a viable alternative for present day approaches. We
demonstrate that increasing the number of concept detectors in a
lexicon yields improved video retrieval performance indeed. In
addition, we show that combining concept detectors at query time has
the potential to boost performance further. We obtain the
experimental evidence on the automatic video search task of TRECVID
2005 using 363 machine learned concept detectors.
}
}
% ICASSP 2007 invited paper on the MediaMill semantic video search engine. No page range is recorded.
@inproceedings{WorringICASSP07,
  author    = {Worring, Marcel and Snoek, Cees G. M. and de Rooij, Ork and Nguyen, Giang P. and Smeulders, Arnold W. M.},
  title     = {The {MediaMill} Semantic Video Search Engine},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  address   = {Honolulu, Hawaii, USA},
  month     = {April},
  year      = {2007},
  note      = {Invited paper.},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/worring-mediamill-icassp2007.pdf},
  abstract  = {
In this paper we present the methods underlying the MediaMill
semantic video search engine. The basis for the engine
is a semantic indexing process which is currently based on
a lexicon of 491 concept detectors. To support the user in
navigating the collection, the system defines a visual similarity
space, a semantic similarity space, a semantic thread
space, and browsers to explore them. We compare the different
browsers and their utility within the TRECVID benchmark.
In 2005, we obtained a top-3 result for 19 out of 24
search topics. In 2006 for 14 out of 24.
}
}
% ACM MIR 2006 workshop paper on learning dissimilarity for interactive image retrieval.
@inproceedings{NguyenMIR06,
  author    = {Nguyen, Giang P. and Worring, Marcel and Smeulders, Arnold W. M.},
  title     = {Similarity Learning via Dissimilarity Space in {CBIR}},
  booktitle = {Proceedings of the {ACM} {SIGMM} International Workshop on Multimedia Information Retrieval},
  pages     = {107--116},
  address   = {Santa Barbara, USA},
  month     = {October},
  year      = {2006},
  pdf       = {http://www.science.uva.nl/research/mediamill/pub/nguyen-dissimilarity-mir2006.pdf},
  abstract  = {
In this paper, we introduce a new approach to learn dissimilarity for
interactive search in content based image retrieval. In literature,
dissimilarity is often learned via the feature space by feature selection,
feature weighting or a parameterized function of the features. Different
from existing techniques, we use relevance feedback to adjust dissimilarity
in a dissimilarity space. To create a dissimilarity space, we use
Pekalska's method [15]. After the user gives feedback, we apply active
learning with one-class SVM on this space. Results on a Corel dataset
of 10000 images and a TrecVid collection of 43907 keyframes show that
our proposed approach can improve the retrieval performance over the
feature space based approach.
}
}
% ACM Multimedia 2006 paper introducing the challenge problem for detecting 101 semantic concepts.
@inproceedings{SnoekACM06,
  author    = {Snoek, Cees G. M. and Worring, Marcel and van Gemert, Jan C. and Geusebroek, Jan-Mark and Smeulders, Arnold W. M.},
  title     = {The Challenge Problem for Automated Detection of 101 Semantic Concepts in Multimedia},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  pages     = {421--430},
  address   = {Santa Barbara, USA},
  month     = {October},
  year      = {2006},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-challenge-acm2006.pdf},
  abstract  = {
We introduce the challenge problem for generic video indexing to
gain insight in intermediate steps that affect performance of
multimedia analysis methods, while at the same time fostering
repeatability of experiments. To arrive at a challenge problem, we
provide a general scheme for the systematic examination of automated
concept detection methods, by decomposing the generic video indexing
problem into 2 unimodal analysis experiments, 2 multimodal analysis
experiments, and 1 combined analysis experiment. For each
experiment, we evaluate generic video indexing performance on 85
hours of international broadcast news data, from the TRECVID
2005/2006 benchmark, using a lexicon of 101 semantic concepts. By
establishing a minimum performance on each experiment, the challenge
problem allows for component-based optimization of the generic
indexing issue, while simultaneously offering other researchers a
reference for comparison during indexing methodology development. To
stimulate further investigations in intermediate analysis steps that
influence video indexing performance, the challenge offers to the
research community a manually annotated concept lexicon,
pre-computed low-level multimedia features, trained classifier
models, and five experiments together with baseline performance,
which are all available at http://www.mediamill.nl/challenge/.
}
}
% ACM Multimedia 2006 paper on episode-constrained cross-validation for video classification.
@inproceedings{GemertACM06,
  author    = {van Gemert, Jan C. and Snoek, Cees G. M. and Veenman, Cor J. and Smeulders, Arnold W. M.},
  title     = {The Influence of Cross-Validation on Video Classification Performance},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  pages     = {695--698},
  address   = {Santa Barbara, USA},
  month     = {October},
  year      = {2006},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/gemert-crossvalidation-acm2006.pdf},
  abstract  = {
Digital video is sequential in nature. When video data is used in a
semantic concept classification task, the episodes are usually summarized
with shots. The shots are annotated as containing, or not containing,
a certain concept resulting in a labeled dataset. These labeled shots
can subsequently be used by supervised learning methods (classifiers)
where they are trained to predict the absence or presence of the concept
in unseen shots and episodes. The performance of such automatic
classification systems is usually estimated with cross-validation. By
taking random samples from the dataset for training and testing as such,
part of the shots from an episode are in the training set and another
part from the same episode is in the test set. Accordingly, data
dependence between training and test set is introduced, resulting in
too optimistic performance estimates. In this paper, we experimentally
show this bias, and propose how this bias can be prevented using
``episode-constrained'' cross-validation. Moreover, we show that a 15\%
higher classifier performance can be achieved by using episode
constrained cross-validation for classifier parameter tuning.
}
}
% BMVC 2006 paper proposing the compact Wiccest colour descriptor. No page range is recorded.
@inproceedings{GeusebroekBMVC06,
  author    = {Geusebroek, Jan-Mark},
  title     = {Compact Object Descriptors from Local Colour Invariant Histograms},
  booktitle = {British Machine Vision Conference},
  address   = {Edinburgh, UK},
  month     = {September},
  year      = {2006},
  pdf       = {http://www.science.uva.nl/~mark/pub/2006/GeusebroekBMVC06.pdf},
  abstract  = {
Much emphasis has recently been placed on the detection and recognition
of locally (weak) affine invariant region descriptors for object
recognition. In this paper, we take recognition one step further by
developing features for non-planar objects. We consider the description
of objects with locally smoothly varying surface. For this class of
objects, colour invariant histogram matching has proven to be very
encouraging. However, matching many local colour cubes is
computationally demanding. We propose a compact colour descriptor,
which we call Wiccest, requiring only 12 numbers to locally capture
colour and texture information. The Wiccest features are shown to be
fairly insensitive to photometric effects like shadow, shading, and
illumination colour. Moreover, we demonstrate the features to be
applicable to highly compressed images while retaining discriminative
power.
}
}
% ICPR 2006 paper on lexicon-based browsers in the MediaMill video search engine.
@inproceedings{WorringICPR06,
  author    = {Worring, Marcel and Snoek, Cees G. M. and de Rooij, Ork and Nguyen, Giang P. and Koelma, Dennis C.},
  title     = {Lexicon-based Browsers for Searching in News Video Archives},
  booktitle = {Proceedings of the International Conference on Pattern Recognition},
  pages     = {1256--1259},
  address   = {Hong Kong, China},
  month     = {August},
  year      = {2006},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/worring-browsers-icpr2006.pdf},
  abstract  = {
In this paper we present the methods and visualizations used in the
MediaMill video search engine. The basis for the engine is a semantic
indexing process which derives a lexicon of 101 concepts. To support
the user in navigating the collection, the system defines a visual similarity
space, a semantic similarity space, a semantic thread space, and
browsers to explore them. The search system is evaluated within the
TRECVID benchmark. We obtain a top-3 result for 19 out of 24 search
topics. In addition, we obtain the highest mean average precision of
all search participants.
}
}
% CIVR 2006 paper published as LNCS volume 4071.
% NOTE(review): the address appears to be the publisher's city; the booktitle places the conference in Tempe, Arizona.
@inproceedings{SnoekCIVR06,
  author    = {Snoek, Cees G. M. and Worring, Marcel and Koelma, Dennis C. and Smeulders, Arnold W. M.},
  title     = {Learned Lexicon-driven Interactive Video Retrieval},
  booktitle = {Proceedings of the International Conference on Image and Video Retrieval, CIVR 2006, Tempe, Arizona, July 13--15, 2006},
  editor    = {Sundaram, H. and others},
  series    = {LNCS},
  volume    = {4071},
  publisher = {Springer-Verlag},
  pages     = {11--20},
  address   = {Heidelberg, Germany},
  month     = {July},
  year      = {2006},
  pdf       = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-lexicon-civr2006.pdf},
  abstract  = {
We combine in this paper automatic learning of a large lexicon of
semantic concepts with traditional video retrieval methods into a
novel approach to narrow the semantic gap. The core of the proposed
solution is formed by the automatic detection of an unprecedented
lexicon of 101 concepts. From there, we explore the combination of
query-by-concept, query-by-example, query-by-keyword, and user
interaction into the \emph{MediaMill} semantic video search engine.
We evaluate the search engine against the 2005 NIST TRECVID video
retrieval benchmark, using an international broadcast news archive
of 85 hours. Top ranking results show that the lexicon-driven search
engine is highly effective for interactive video retrieval.
}
}
@COMMENT{{SnoekICME06: IEEE Multimedia and Expo conference paper on the semantic pathfinder (Toronto, July 2006). PAGES is an empty placeholder.}}
@INPROCEEDINGS{SnoekICME06,
  AUTHOR    = {Cees G. M. Snoek and Marcel Worring and Jan-Mark Geusebroek and Dennis C. Koelma and Frank J. Seinstra and Arnold W. M. Smeulders},
  TITLE     = {The Semantic Pathfinder for Generic News Video Indexing},
  BOOKTITLE = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  ADDRESS   = {Toronto, Canada},
  MONTH     = {July},
  YEAR      = {2006},
  PAGES     = {},
  PDF       = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-pathfinder-icme2006.pdf},
  ABSTRACT  = {
This paper presents the semantic pathfinder architecture for
generic indexing of video archives. The pathfinder automatically
extracts semantic concepts from video based on the exploration of
different paths through three consecutive analysis steps, closely
linked to the video production process, namely: content analysis,
style analysis, and context analysis. The virtue of the semantic
pathfinder is its learned ability to find a best path of analysis
steps on a per-concept basis. To show the generality of this
indexing approach we develop detectors for a lexicon of 32
concepts and we evaluate the semantic pathfinder against the 2004
NIST TRECVID video retrieval benchmark, using a news archive of 64
hours. Top ranking performance indicates the merit of the semantic
pathfinder.
}
}
@COMMENT{{GemertSLAM06: workshop paper on scene categorization, held in conjunction with CVPR'06 (New York, June 2006). PAGES is an empty placeholder. The first author is written with a braced given-name group; presumably intentional, so it is kept as is.}}
@INPROCEEDINGS{GemertSLAM06,
  AUTHOR    = {{Jan C. van} Gemert and Jan-Mark Geusebroek and Cor J. Veenman and Cees G. M. Snoek and Arnold W. M. Smeulders},
  TITLE     = {Robust Scene Categorization by Learning Image Statistics in Context},
  BOOKTITLE = {Int'l Workshop on Semantic Learning Applications in Multimedia, in conjunction with {CVPR'06}},
  ADDRESS   = {New York, USA},
  MONTH     = {June},
  YEAR      = {2006},
  PAGES     = {},
  PDF       = {http://staff.science.uva.nl/~cgmsnoek/pub/gemert-scene-slam2006.pdf},
  ABSTRACT  = {
We present a generic and robust approach for scene categorization.
A complex scene is described by proto-concepts like vegetation,
water, fire, sky etc. These proto-concepts are represented by low
level features, where we use natural images statistics to compactly
represent color invariant texture information by a Weibull distribution.
We introduce the notion of contextures which preserve the context of
textures in a visual scene with an occurrence histogram (context) of
similarities to proto-concept descriptors (texture). In contrast to
a codebook approach, we use the similarity to all vocabulary elements
to generalize beyond the code words. Visual descriptors are attained
by combining different types of contexts with different texture
parameters. The visual scene descriptors are generalized to visual
categories by training a support vector machine. We evaluate our
approach on 3 different datasets: 1) 50 categories for the TRECVID
video dataset; 2) the Caltech 101-object images; 3) 89 categories
being the intersection of the Corel photo stock with the Art Explosion
photo stock. Results show that our approach is robust over different
datasets, while maintaining competitive performance.
}
}
@COMMENT{{SmeuldersISCCSP06: ISCCSP2006 paper on browsing a national video archive (Marrakech, March 2006). PAGES is an empty placeholder.}}
@INPROCEEDINGS{SmeuldersISCCSP06,
  AUTHOR    = {Arnold W. M. Smeulders and Jan van Gemert and Jan-Mark Geusebroek and Cees Snoek and Marcel Worring},
  TITLE     = {Browsing for the National Dutch Video Archive},
  BOOKTITLE = {ISCCSP2006},
  ADDRESS   = {Marrakech, Morocco},
  MONTH     = {March},
  YEAR      = {2006},
  PAGES     = {},
  PDF       = {http://www.science.uva.nl/~smeulder/pubs/ISCCSP2006SmeuldersTEMP.pdf},
  ABSTRACT  = {
Pictures have always been a prime carrier of Dutch culture. But pictures
take a new form. We live in times of broad- and narrowcasting through
Internet, of passive and active viewers, of direct or delayed broadcast,
and of digital pictures being delivered in the museum or at home. At the
same time, the picture and television archives turn digital. Archives are
going to be swamped with information requests unless they swiftly adapt
to partially automatic annotation and digital retrieval. Our aim is to
provide faster and more complete access to picture archives by digital
analysis. Our approach consists of a multi-media analysis of features of
pictures in tandem with the language that describes those pictures, under
the guidance of a visual ontology. The general scientific paradigm we
address is the detection of directly observables fused into semantic
features learned from large repositories of digital video. We use
invariant, natural-image statistics-based contextual feature sets for
capturing the concepts of images and integrate that as early as possible
with text. The system consists of a large for science yet small for
practice set of visual concepts permitting the retrieval of semantically
formulated queries. We will demonstrate a PC-based, off-line trained state
of the art system for browsing broadcast news-archives.
}
}
@COMMENT{{NguyenMIR05: ACM SIGMM Multimedia Information Retrieval workshop paper on scenario optimization for interactive category search (Singapore, November 2005). PAGES is an empty placeholder.}}
@INPROCEEDINGS{NguyenMIR05,
  AUTHOR    = {Giang P. Nguyen and Marcel Worring},
  TITLE     = {Scenario optimization for interactive category search},
  BOOKTITLE = {Proceedings of the {ACM} {SIGMM} International Workshop on Multimedia Information Retrieval},
  ADDRESS   = {Singapore},
  MONTH     = {November},
  YEAR      = {2005},
  PAGES     = {},
  PDF       = {http://www.science.uva.nl/~giangnp/PUBS/PDF/2005/giangnpMIR05.pdf},
  ABSTRACT  = {
Most of the existing work in interactive content based retrieval
concentrates on machine learning methods for effective use of
relevance feedback. On the other end of the spectrum, the
information visualization community focusses on effective methods
for conveying information to the user. What lacks is research
considering the information visualization and interactive content
based retrieval as truly integrated parts of one search system. In
such an integrated system there are many degrees of freedom like
the number of images to display, the image size, different
visualization modes, and possible feedback modes. To find optimal
values for all of those using user studies is unfeasible. We
therefore develop scenarios in which tasks and user actions are
simulated. These are then optimized based on objective constraints
and evaluation criteria. In such a manner the degrees of freedom
are reduced and the remaining degrees can be evaluated in user
studies. In this paper we present a system which integrates
advanced similarity based visualization with active learning. We
have performed extensive scenario based experimentation on an
interactive category search task. The results show that indeed the
use of advanced visualization and active learning pays off.
}
}
@COMMENT{{HollinkACM05: ACM Multimedia paper on building a visual ontology for video retrieval (Singapore, November 2005), pages 479--482.}}
@INPROCEEDINGS{HollinkACM05,
  AUTHOR    = {Laura Hollink and Marcel Worring and Guus Schreiber},
  TITLE     = {Building a Visual Ontology for Video Retrieval},
  BOOKTITLE = {Proceedings of the {ACM} International Conference on Multimedia},
  ADDRESS   = {Singapore},
  MONTH     = {November},
  YEAR      = {2005},
  PAGES     = {479--482},
  PDF       = {http://www.cs.vu.nl/~guus/papers/Hollink05b.pdf},
  ABSTRACT  = {
To ensure access to growing video collections, annotation is becoming more
and more important. Using background knowledge in the form of ontologies
or thesauri is a way to facilitate annotation in a broad domain. Current
ontologies are not suitable for (semi-) automatic annotation of visual
resources as they contain little visual information about the concepts
they describe. We investigate how an ontology that does contain visual
information can facilitate annotation in a broad domain and identify
requirements that a visual ontology has to meet. Based on these
requirements, we create a visual ontology out of two existing knowledge
corpora (WordNet and MPEG-7) by creating links between visual and general
concepts. We test performance of the ontology on 40 shots of news video,
and discuss the added value of each visual property.
}
}
@COMMENT{{SnoekACM05a: ACM Multimedia paper comparing early and late fusion for semantic video analysis (Singapore, November 2005), pages 399--402.}}
@INPROCEEDINGS{SnoekACM05a,
  AUTHOR    = {Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders},
  TITLE     = {Early versus Late Fusion in Semantic Video Analysis},
  BOOKTITLE = {Proceedings of the {ACM} International Conference on Multimedia},
  ADDRESS   = {Singapore},
  MONTH     = {November},
  YEAR      = {2005},
  PAGES     = {399--402},
  PDF       = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-earlylate-acm2005.pdf},
  ABSTRACT  = {
Semantic analysis of multimodal video aims to index segments of interest at a
conceptual level. In reaching this goal, it requires an analysis of several
information streams. At some point in the analysis these streams need to be
fused. In this paper, we consider two classes of fusion schemes, namely early
fusion and late fusion. The former fuses modalities in feature space, the
latter fuses modalities in semantic space. We show by experiment on 184 hours
of broadcast video data and for 20 semantic concepts, that late fusion tends
to give slightly better performance for most concepts. However, for those
concepts where early fusion performs better the difference is more significant.
}
}
@COMMENT{{SnoekICME05a: IEEE Multimedia and Expo conference paper on video analysis beyond the key frame (Amsterdam, July 2005). PAGES is an empty placeholder.}}
@INPROCEEDINGS{SnoekICME05a,
  AUTHOR    = {Cees G. M. Snoek and Marcel Worring and Jan-Mark Geusebroek and Dennis C. Koelma and Frank J. Seinstra},
  TITLE     = {On the Surplus Value of Semantic Video Analysis Beyond the Key Frame},
  BOOKTITLE = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  ADDRESS   = {Amsterdam, The Netherlands},
  MONTH     = {July},
  YEAR      = {2005},
  PAGES     = {},
  PDF       = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-surplus-icme2005.pdf},
  ABSTRACT  = {
Typical semantic video analysis methods aim for classification of camera shots
based on extracted features from a single key frame only. In this paper, we
sketch a video analysis scenario and evaluate the benefit of analysis beyond
the key frame for semantic concept detection performance. We developed
detectors for a lexicon of 26 concepts, and evaluated their performance on
120 hours of video data. Results show that, on average, detection performance
can increase with almost 40\% when the analysis method takes more visual
content into account.
}
}
@COMMENT{{SnoekGFKL05: invited paper in the proceedings of the 28th annual conference of the Gesellschaft fuer Klassifikation (held March 2004, published by Springer-Verlag in 2005), pages 97--108. PDF is an empty placeholder.}}
@INPROCEEDINGS{SnoekGFKL05,
  AUTHOR    = {Cees G. M. Snoek and Marcel Worring},
  TITLE     = {Multimedia Pattern Recognition in Soccer Video using Time Intervals},
  BOOKTITLE = {Classification --- the Ubiquitous Challenge, Proceedings of the 28th Annual Conference of the Gesellschaft f{\"u}r Klassifikation e.V., University of Dortmund, March 9--11, 2004},
  EDITOR    = {C. Weihs and W. Gaul},
  SERIES    = {Studies in Classification, Data Analysis, and Knowledge Organization},
  PAGES     = {97--108},
  PUBLISHER = {Springer-Verlag},
  ADDRESS   = {Berlin, Germany},
  YEAR      = {2005},
  PDF       = {},
  NOTE      = {\emph{Invited paper}},
  ABSTRACT  = {
We focus on the problem of learning rich semantic patterns from the multimedia
data associated with broadcast video documents. In this talk we propose a generic
and flexible framework for produced video classification that is capable to learn
semantic concepts from multimodal sources based on analyzed style elements. Four
properties that are indicative for style are identified, i.e. layout, content,
capture, and concept context. The framework allows for robust classification of
different semantic concepts in produced video by using a fixed core of common
layout, content, and capture elements in combination with varying concept specific
context elements. Concepts are classified using a Stacked Probabilistic Support
Vector Machine. Results on 120 hours of video data from the 2003 TRECVID benchmark
show that, by using the proposed framework, several rich semantic concepts in
broadcast news can be classified with state-of-the-art accuracy.
}
}
@COMMENT{{NguyenDELOS05: DELOS workshop paper on similarity based visualization of image collections (Cortona, May 2005). PAGES is an empty placeholder.}}
@INPROCEEDINGS{NguyenDELOS05,
  AUTHOR    = {Giang P. Nguyen and Marcel Worring},
  TITLE     = {Similarity based visualization of image collections},
  BOOKTITLE = {Proceedings of the 7th International Workshop of the EU Network of Excellence DELOS on Audio-visual Content and Information Visualization in Digital Libraries},
  ADDRESS   = {Cortona, Italy},
  MONTH     = {May},
  YEAR      = {2005},
  PAGES     = {},
  PDF       = {http://www.science.uva.nl/~giangnp/PUBS/PDF/2005/giangnpAVIVDiLib05.pdf},
  ABSTRACT  = {
In literature, few content based multimedia retrieval systems take
the visualization as a tool for exploring the collections.
However, when searching for images without examples to start with,
one needs to explore the data set. Up to now, most available
systems just show random collections of images in 2D grid form.
More recently, advanced techniques have been developed for
browsing based on similarity. However, none of them analyze the
problems that occur when visualizing large visual collections. In
this paper, we make these problems explicit. From there, we
establish three general requirements: overview, visibility, and
data structure preservation. Solutions for each requirement are
proposed. Finally, a system is presented and experimental results
are given to demonstrate our theory and approach.
}
}
@COMMENT{{SeinstraIPDPS05: paper at the 19th International Parallel and Distributed Processing Symposium on parallel processing of the 2004 TRECVID data set (Denver, April 2005). PAGES is an empty placeholder.}}
@INPROCEEDINGS{SeinstraIPDPS05,
  AUTHOR    = {Frank J. Seinstra and Cees G. M. Snoek and Dennis C. Koelma and Jan-Mark Geusebroek and Marcel Worring},
  TITLE     = {User Transparent Parallel Processing of the 2004 {NIST} {TRECVID} Data Set},
  BOOKTITLE = {Proceedings of the 19th International Parallel \& Distributed Processing Symposium},
  ADDRESS   = {Denver, USA},
  MONTH     = {April},
  YEAR      = {2005},
  PAGES     = {},
  PDF       = {http://staff.science.uva.nl/~fjseins/Papers/Conferences/ipdps2005.pdf},
  ABSTRACT  = {
The Parallel-Horus framework, developed at the University of Amsterdam, is a
unique software architecture that allows non-expert parallel programmers to
develop fully sequential multimedia applications for efficient execution on
homogeneous Beowulf-type commodity clusters. Previously obtained results for
realistic, but relatively small-sized applications have shown the feasibility
of the Parallel-Horus approach, with parallel performance consistently being
found to be optimal with respect to the abstraction level of message passing
programs. In this paper we discuss the most serious challenge Parallel-Horus
has had to deal with so far: the processing of over 184 hours of video included
in the 2004 NIST TRECVID evaluation, i.e. the de facto international standard
benchmark for content-based video retrieval. Our results and experiences
confirm that Parallel-Horus is a very powerful support-tool for state-of-the-art
research and applications in multimedia processing.
}
}
@COMMENT{{HollinkKMSA04: workshop paper on adding spatial semantics to image annotations (Hiroshima, November 2004). No page numbers are recorded.}}
@INPROCEEDINGS{HollinkKMSA04,
  AUTHOR    = {Laura Hollink and Giang Nguyen and Guus Schreiber and Jan Wielemaker and Bob Wielinga and Marcel Worring},
  TITLE     = {Adding Spatial Semantics to Image Annotations},
  BOOKTITLE = {International Workshop on Knowledge Markup and Semantic Annotation},
  ADDRESS   = {Hiroshima, Japan},
  MONTH     = {November},
  YEAR      = {2004},
  PDF       = {http://www.cs.vu.nl/~guus/papers/Hollink04c.pdf},
  ABSTRACT  = {
In this paper we discuss the support of users in adding spatial
information semi-automatically to annotations of images. Descriptions of
objects depicted in an image are extended with information about the
position of those objects. We distinguish two types of spatial concepts:
absolute positions of objects (e.g., east, west) and relative spatial
relations between objects (e.g., left, above). We show the use of a tool
for a collection of art paintings with preexisting RDF annotations,
including a list of image objects. First, the tool segments a painting
into regions. The user selects regions, and labels these with objects from
the existing annotation. Then, the tool computes absolute positions and
relative spatial relations of the selected regions, and adds these to the
annotation. A small evaluation study is reported in which annotations
generated by the tool are compared to manual annotations by ten
volunteers.
}
}
@COMMENT{{HollinkCIVR04: CIVR 2004 paper reporting a user study of video retrieval strategies; published by Springer-Verlag as volume 3115 of the series, pages 6--14. Names are written with a space after every initial so that BibTeX splits given names and surnames correctly.}}
@INPROCEEDINGS{HollinkCIVR04,
  AUTHOR    = {Laura Hollink and Giang P. Nguyen and Dennis Koelma and Guus Schreiber and Marcel Worring},
  TITLE     = {User Strategies in Video Retrieval: a Case Study},
  BOOKTITLE = {Proceedings of the International Conference on Image and Video Retrieval, CIVR 2004, Dublin, Ireland, July 21--23, 2004},
  EDITOR    = {P. Enser and Y. Kompatsiaris and N. E. O'Connor and A. F. Smeaton and A. W. M. Smeulders},
  SERIES    = {Lecture Notes in Computer Science},
  VOLUME    = {3115},
  PAGES     = {6--14},
  PUBLISHER = {Springer-Verlag},
  ADDRESS   = {Heidelberg, Germany},
  MONTH     = {July},
  YEAR      = {2004},
  PDF       = {http://www.cs.vu.nl/~guus/papers/Hollink04b.pdf},
  ABSTRACT  = {
In this paper we present the results of a user study that was conducted in
combination with a submission to TRECVID 2003. Search behavior of students
querying an interactive video-retrieval system was analyzed. 242 Searches
by 39 students on 24 topics were assessed. Questionnaire data, logged user
actions on the system, and a quality measure of each search provided by
TRECVID were studied. Analysis of the results at various stages in the
retrieval process suggests that retrieval based on transcriptions of the
speech in video data adds more to the average precision of the result than
content-based retrieval. The latter is particularly useful in providing
the user with an overview of the dataset and thus an indication of the
success of a search.
}
}
@COMMENT{{NguyenICME04a: IEEE Multimedia and Expo conference paper on a user based framework for salient detail extraction (Taipei, June 2004). PAGES is an empty placeholder.}}
@INPROCEEDINGS{NguyenICME04a,
  AUTHOR    = {Giang P. Nguyen and Marcel Worring},
  TITLE     = {A user based framework for salient detail extraction},
  BOOKTITLE = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  ADDRESS   = {Taipei, Taiwan},
  MONTH     = {June},
  YEAR      = {2004},
  PAGES     = {},
  PDF       = {http://www.science.uva.nl/~giangnp/PUBS/PDF/2004/giangnpICME04a.pdf},
  ABSTRACT  = {
In this paper, we consider the interaction with salient details in
the image i.e. points, lines, and regions. Interactive salient
detail definition goes further than summarizing the image into a
set of salient details since the saliency of details depends on
the context, the application and the user. We propose an
interaction framework for salient details from the perspective of
the user, which dynamically updates the user- and context-dependent
definition of saliency based on relevance feedback. A number of
instantiations of the framework are presented.
}
}
@COMMENT{{NguyenICME04b: IEEE Multimedia and Expo conference paper on reducing image overlap in similarity based visualization (Taipei, June 2004). PAGES is an empty placeholder.}}
@INPROCEEDINGS{NguyenICME04b,
  AUTHOR    = {Giang P. Nguyen and Marcel Worring},
  TITLE     = {Optimizing similarity based visualization in content based image retrieval},
  BOOKTITLE = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  ADDRESS   = {Taipei, Taiwan},
  MONTH     = {June},
  YEAR      = {2004},
  PAGES     = {},
  PDF       = {http://www.science.uva.nl/~giangnp/PUBS/PDF/2004/giangnpICME04b.pdf},
  ABSTRACT  = {
In any CBIR system, visualization is important, either to show the
final result to the user or to form the basis for interaction.
Advanced systems use 2-dimensional similarity based visualization
which show not only the information of one image itself but also
the relations between images. A problem in interactive 2D
visualization is the overlap between the images displayed. This
obviously reduces the search capability. Simply spreading the
images on the screen space will not preserve the relations between
them. In this paper, we propose a visualization scheme which
reduces the overlap as well as preserves the general distribution
of the images displayed. Results show that an effective balance
between display of structures and limited overlap can be achieved.
}
}
@COMMENT{{SnoekICME04: IEEE Multimedia and Expo conference paper on detecting TV news monologues by style analysis (Taipei, June 2004). PAGES is an empty placeholder.}}
@INPROCEEDINGS{SnoekICME04,
  AUTHOR    = {Cees G. M. Snoek and Marcel Worring and Alexander G. Hauptmann},
  TITLE     = {Detection of {TV} News Monologues by Style Analysis},
  BOOKTITLE = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  ADDRESS   = {Taipei, Taiwan},
  MONTH     = {June},
  YEAR      = {2004},
  PAGES     = {},
  PDF       = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-style-icme2004.pdf},
  ABSTRACT  = {
We propose a method for detection of semantic concepts in produced video
based on style analysis. Recognition of concepts is done by applying a
classifier ensemble to the detected style elements. As a case study we
present a method for detecting the concept of news subject monologues. Our
approach had the best average precision performance amongst 26 submissions
in the 2003 TRECVID benchmark.
}
}
@COMMENT{{WorringICME04: IEEE Multimedia and Expo conference paper on interactive search in video archives (Taipei, June 2004). PAGES is an empty placeholder.}}
@INPROCEEDINGS{WorringICME04,
  AUTHOR    = {Marcel Worring and Giang P. Nguyen and Laura Hollink and Jan C. van Gemert and Dennis C. Koelma},
  TITLE     = {Accessing Video Archives Using Interactive Search},
  BOOKTITLE = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  ADDRESS   = {Taipei, Taiwan},
  MONTH     = {June},
  YEAR      = {2004},
  PAGES     = {},
  PDF       = {http://www.cs.vu.nl/~laurah/1/papers/Worring04_trec.pdf},
  ABSTRACT  = {
In this presentation we present a system for interactive search
in video archives. In our view interactive search is a four-step
process composed of indexing, filtering, browsing, and
ranking. We have experimentally verified, using 22 groups
of two participants each, how users apply these steps in the
interactive search and how well they perform.
}
}
@COMMENT{{NguyenMIR03: ACM SIGMM Multimedia Information Retrieval workshop paper on query definition using interactive saliency (Berkeley, November 2003). PAGES is an empty placeholder.}}
@INPROCEEDINGS{NguyenMIR03,
  AUTHOR    = {Giang P. Nguyen and Marcel Worring},
  TITLE     = {Query Definition using Interactive Saliency},
  BOOKTITLE = {Proceedings of the {ACM} {SIGMM} International Workshop on Multimedia Information Retrieval},
  ADDRESS   = {Berkeley, USA},
  MONTH     = {November},
  YEAR      = {2003},
  PAGES     = {},
  PDF       = {http://www.science.uva.nl/~giangnp/PUBS/PDF/2003/giangnpMIR03.pdf},
  ABSTRACT  = {
Content-based image retrieval (CBIR) has been under investigation
for a long time with many systems built to meet different
application demands. However, in all systems, there is still a big
gap between the user's expectation and the system's retrieval
capabilities. Therefore, user interaction is an essential
component of any CBIR system. Interaction up to now has mostly
focused on global image features or similarities. We consider the
interaction with salient details in the image i.e. points, lines,
and regions. Interactive salient detail definition goes further
than automatically summarizing the image into a set of salient
details. We aim to dynamically update the user- and
context-dependent definition of saliency based on relevance
feedback from the user. In this paper, we propose an interaction
framework for salient details from the perspective of the user.
}
}
@COMMENT{{SnoekICME03a: IEEE Multimedia and Expo conference paper presenting the Time Interval Maximum Entropy framework for soccer video (Baltimore, July 2003), pages 481--484.}}
@INPROCEEDINGS{SnoekICME03a,
  AUTHOR    = {Cees G. M. Snoek and Marcel Worring},
  TITLE     = {Time Interval Maximum Entropy based Event Indexing in Soccer Video},
  BOOKTITLE = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  ADDRESS   = {Baltimore, USA},
  MONTH     = {July},
  YEAR      = {2003},
  PAGES     = {481--484},
  PDF       = {http://staff.science.uva.nl/~cgmsnoek/pub/icme2003.pdf},
  ABSTRACT  = {
Multimodal indexing of events in video documents poses problems with respect
to representation, inclusion of contextual information, and synchronization
of the heterogeneous information sources involved. In this paper we present
the Time Interval Maximum Entropy (TIME) framework that tackles aforementioned
problems. To demonstrate the viability of TIME for event classification in
multimodal video, an evaluation was performed on the domain of soccer broadcasts.
It was found that by applying TIME, the amount of video a user has to watch in
order to see almost all highlights can be reduced considerably.
}
}
@COMMENT{{WorringSOFSEM02: paper at the 29th Annual Conference on Current Trends in Theory and Practice of Informatics (Milovy, 2002); published by Springer-Verlag as volume 2540 of Lecture Notes in Computer Science, pages 135--148.}}
@INPROCEEDINGS{WorringSOFSEM02,
  AUTHOR    = {Marcel Worring and Andrew Bagdanov and Jan van Gemert and Jan-Mark Geusebroek and Minh Hoang and Guus Schreiber and Cees G. M. Snoek and Jeroen Vendrig and Jan Wielemaker and Arnold W. M. Smeulders},
  TITLE     = {Interactive Indexing and Retrieval of Multimedia Content},
  BOOKTITLE = {Proceedings of the 29th Annual Conference on Current Trends in Theory and Practice of Informatics},
  SERIES    = {Lecture Notes in Computer Science},
  VOLUME    = {2540},
  PAGES     = {135--148},
  PUBLISHER = {Springer-Verlag},
  ADDRESS   = {Milovy, Czech Republic},
  YEAR      = {2002},
  PDF       = {http://staff.science.uva.nl/~cgmsnoek/pub/sofsem2002.pdf},
  ABSTRACT  = {
The indexing and retrieval of multimedia items is difficult due to the semantic
gap between the user's perception of the data and the descriptions we can derive
automatically from the data using computer vision, speech recognition, and
natural language processing. In this contribution we consider the nature of
the semantic gap in more detail and show examples of methods that help in
limiting the gap. These methods can be automatic, but in general the indexing
and retrieval of multimedia items should be a collaborative process between the
system and the user. We show how to employ the user's interaction for limiting
the semantic gap.
}
}
@COMMENT{{SnoekICME02: IEEE Multimedia and Expo conference paper reviewing multimodal video indexing (Lausanne, August 2002), volume 2, pages 21--24.}}
@INPROCEEDINGS{SnoekICME02,
  AUTHOR    = {Cees G. M. Snoek and Marcel Worring},
  TITLE     = {A Review on Multimodal Video Indexing},
  BOOKTITLE = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  VOLUME    = {2},
  PAGES     = {21--24},
  ADDRESS   = {Lausanne, Switzerland},
  MONTH     = {August},
  YEAR      = {2002},
  PDF       = {http://staff.science.uva.nl/~cgmsnoek/pub/icme2002.pdf},
  ABSTRACT  = {
Efficient and effective handling of video documents depends on the availability
of indexes. Manual indexing is unfeasible for large video collections. Efficient,
single modality based, video indexing methods have appeared in literature.
Effective indexing, however, requires a multimodal approach in which either the
most appropriate modality is selected or the different modalities are used in
collaborative fashion. In this paper we present a framework for multimodal video
indexing, which views a video document from the perspective of its author. The
framework serves as a blueprint for a generic and flexible multimodal video
indexing system, and generalizes different state-of-the-art video indexing
methods. It furthermore forms the basis for categorizing these different
methods.
}
}