articles.bib

@COMMENT{{{This file has been generated by bib2bib 1.74}}
@COMMENT{{{Command line: bib2bib-1.74.exe -ob articles.bib -c $type='ARTICLE' -s year -r mediamill.bib}}

% van de Sande, Gevers, Snoek -- IEEE Transactions on Multimedia, 2011.
% In press: volume, number, pages, month, PDF link and abstract are not yet
% available, so those fields are omitted instead of being left empty.
@article{SandeTMM11,
  author  = {Koen E. A. van de Sande and Theo Gevers and Cees G. M. Snoek},
  title   = {Empowering Visual Categorization with the {GPU}},
  journal = {{IEEE} Transactions on Multimedia},
  year    = {2011},
  note    = {In press}
}

% van de Sande, Gevers, Snoek -- IEEE TPAMI 32(9), 2010.
% Non-standard fields: pdf (full-text link), software (project site), abstract.
@article{SandePAMI10,
  author   = {Koen E. A. van de Sande and Theo Gevers and Cees G. M. Snoek},
  title    = {Evaluating Color Descriptors for Object and Scene Recognition},
  journal  = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
  year     = {2010},
  month    = {September},
  volume   = {32},
  number   = {9},
  pages    = {1582--1596},
  pdf      = {http://staff.science.uva.nl/~cgmsnoek/pub/sande-colordescriptors-pami.pdf},
  software = {http://www.colordescriptors.com},
  abstract = {
    Image category recognition is important to access visual information
    on the level of objects and scene types. So far, intensity-based
    descriptors have been widely used for feature extraction at salient
    points. To increase illumination invariance and discriminative power,
    color descriptors have been proposed. Because many different descriptors
    exist, a structured overview is required of color invariant descriptors
    in the context of image category recognition. Therefore, this paper
    studies the invariance properties and the distinctiveness of color
    descriptors in a structured way. The analytical invariance properties
    of color descriptors are explored, using a taxonomy based on invariance
    properties with respect to photometric transformations, and tested
    experimentally using a dataset with known illumination conditions. In
    addition, the distinctiveness of color descriptors is assessed
    experimentally using two benchmarks, one from the image domain and one
    from the video domain. From the theoretical and experimental results,
    it can be derived that invariance to light intensity changes and light
    color changes affects category recognition. The results reveal further
    that, for light intensity changes, the usefulness of invariance is
    category-specific. Overall, when choosing a single descriptor and no
    prior knowledge about the dataset and object and scene categories is
    available, the OpponentSIFT is recommended. Furthermore, a combined set
    of color descriptors outperforms intensity-based SIFT and improves
    category recognition by 8\% on the PASCAL VOC 2007 and by 7\% on the
    MediaMill Challenge.
  }
}

% Byrne, Doherty, Snoek, Jones, Smeaton -- Multimedia Tools and Applications
% 49(1), 2010. The abstract is kept pure ASCII (plain apostrophes) so the
% entry is safe for classic 8-bit BibTeX.
@article{ByrneMMTA10,
  author   = {Daragh Byrne and Aiden R. Doherty and Cees G. M. Snoek and Gareth J. F. Jones and Alan F. Smeaton},
  title    = {Everyday Concept Detection in Visual Lifelogs: Validation, Relationships and Trends},
  journal  = {Multimedia Tools and Applications},
  year     = {2010},
  month    = {August},
  volume   = {49},
  number   = {1},
  pages    = {119--144},
  pdf      = {http://staff.science.uva.nl/~cgmsnoek/pub/byrne-everyday-concept-detection-mmta.pdf},
  abstract = {
    The Microsoft SenseCam is a small lightweight wearable camera used to
    passively capture photos and other sensor readings from a user's day-to-day activities.
    It captures on average 3,000 images in a typical day, equating to almost 1 million
    images per year. It can be used to aid memory by creating a personal multimedia
    lifelog, or visual recording of the wearer's life. However the sheer volume of image
    data captured within a visual lifelog creates a number of challenges, particularly for
    locating relevant content. Within this work, we explore the applicability of semantic
    concept detection, a method often used within video retrieval, on the domain of
    visual lifelogs. Our concept detector models the correspondence between low-level
    visual features and high-level semantic concepts (such as indoors, outdoors, people,
    buildings, etc.) using supervised machine learning. By doing so it determines the
    probability of a concept's presence. We apply detection of 27 everyday semantic
    concepts on a lifelog collection composed of 257,518 SenseCam images from 5
    users. The results were evaluated on a subset of 95,907 images, to determine the
    accuracy for detection of each semantic concept. We conducted further analysis
    on the temporal consistency, co-occurrence and relationships within the detected
    concepts to more extensively investigate the robustness of the detectors within this
    domain.
  }
}

% Snoek, Smeulders -- IEEE Computer 43(6), 2010.
@article{SnoekCOM10,
  author   = {Cees G. M. Snoek and Arnold W. M. Smeulders},
  title    = {Visual-Concept Search Solved?},
  journal  = {{IEEE} Computer},
  year     = {2010},
  month    = {June},
  volume   = {43},
  number   = {6},
  pages    = {76--78},
  pdf      = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-smeulders-solved-computer.pdf},
  abstract = {
    Progress in visual-concept search suggests that machine understanding of images
    is within reach.
  }
}

% de Rooij, Worring, van Wijk -- IEEE Computer Graphics and Applications 30(5), 2010.
% No abstract is available for this entry, so the field is omitted.
@article{RooijCGA10,
  author  = {Ork de Rooij and Marcel Worring and Jack J. van Wijk},
  title   = {MediaTable: Interactive Categorization of Multimedia Collections},
  journal = {{IEEE} Computer Graphics and Applications},
  year    = {2010},
  month   = {May},
  volume  = {30},
  number  = {5},
  pages   = {42--51},
  pdf     = {http://www.science.uva.nl/research/publications/2010/deRooijCGA2010}
}

% van Gemert, Snoek, Veenman, Smeulders, Geusebroek -- CVIU 114(4), 2010.
@article{GemertCVIU10,
  author   = {Jan C. van Gemert and Cees G. M. Snoek and Cor J. Veenman and Arnold W. M. Smeulders and Jan-Mark Geusebroek},
  title    = {Comparing Compact Codebooks for Visual Categorization},
  journal  = {Computer Vision and Image Understanding},
  year     = {2010},
  month    = {April},
  volume   = {114},
  number   = {4},
  pages    = {450--462},
  pdf      = {http://staff.science.uva.nl/~cgmsnoek/pub/gemert-compact-codebooks-cviu.pdf},
  abstract = {
    In the face of current large-scale video libraries, the practical applicability of
    content-based indexing algorithms is constrained by their efficiency. This paper
    strives for efficient large-scale video indexing by comparing various visual-based
    concept categorization techniques. In visual categorization, the popular codebook
    model has shown excellent categorization performance. The codebook model represents
    continuous visual features by discrete prototypes predefined in a vocabulary. The
    vocabulary size has a major impact on categorization efficiency, where a more compact
    vocabulary is more efficient. However, smaller vocabularies typically score lower on
    classification performance than larger vocabularies. This paper compares four approaches
    to achieve a compact codebook vocabulary while retaining categorization performance.
    For these four methods, we investigate the trade-off between codebook compactness
    and categorization performance. We evaluate the methods on more than 200 h of challenging
    video data with as many as 101 semantic concepts. The results allow us to create a
    taxonomy of the four methods based on their efficiency and categorization performance.
  }
}

% de Rooij, Worring -- IEEE Transactions on Multimedia 12(2), 2010.
% No abstract is available for this entry, so the field is omitted.
@article{RooijTMM10,
  author  = {Ork de Rooij and Marcel Worring},
  title   = {Browsing Video Along Multiple Threads},
  journal = {{IEEE} Transactions on Multimedia},
  year    = {2010},
  month   = {February},
  volume  = {12},
  number  = {2},
  pages   = {121--130},
  pdf     = {http://www.science.uva.nl/research/publications/2010/deRooijITM2010}
}

% Li, Snoek, Worring -- IEEE Transactions on Multimedia 11(7), 2009.
@article{LiTMM09,
  author   = {Xirong Li and Cees G. M. Snoek and Marcel Worring},
  title    = {Learning Social Tag Relevance by Neighbor Voting},
  journal  = {{IEEE} Transactions on Multimedia},
  year     = {2009},
  month    = {November},
  volume   = {11},
  number   = {7},
  pages    = {1310--1322},
  pdf      = {http://staff.science.uva.nl/~cgmsnoek/pub/li-socialtagrelevance-tmm.pdf},
  abstract = {
    Social image analysis and retrieval is important
    for helping people organize and access the increasing amount
    of user-tagged multimedia. Since user tagging is known to be
    uncontrolled, ambiguous, and overly personalized, a fundamental
    problem is how to interpret the relevance of a user-contributed
    tag with respect to the visual content the tag is describing.
    Intuitively, if different persons label visually similar images using
    the same tags, these tags are likely to reflect objective aspects
    of the visual content. Starting from this intuition, we propose
    in this paper a neighbor voting algorithm which accurately and
    efficiently learns tag relevance by accumulating votes from visual
    neighbors. Under a set of well defined and realistic assumptions,
    we prove that our algorithm is a good tag relevance measurement
    for both image ranking and tag ranking. Three experiments on
    3.5 million Flickr photos demonstrate the general applicability
    of our algorithm in both social image retrieval and image tag
    suggestion. Our tag relevance learning algorithm substantially
    improves upon baselines for all the experiments. The results
    suggest that the proposed algorithm is promising for real-world
    applications.
  }
}

% Snoek, Worring -- Foundations and Trends in Information Retrieval 4(2), 2009.
% No publication month is recorded, so the month field is omitted.
@article{SnoekFNTIR09,
  author   = {Cees G. M. Snoek and Marcel Worring},
  title    = {Concept-Based Video Retrieval},
  journal  = {Foundations and Trends in Information Retrieval},
  year     = {2009},
  volume   = {4},
  number   = {2},
  pages    = {215--322},
  pdf      = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-concept-based-video-retrieval-fntir.pdf},
  abstract = {
    In this paper, we review 300 references on video retrieval, indicating
    when text-only solutions are unsatisfactory and showing the promising
    alternatives which are in majority concept-based. Therefore, central
    to our discussion is the notion of a semantic concept: an objective
    linguistic description of an observable entity. Specifically, we present
    our view on how its automated detection, selection under uncertainty,
    and interactive usage might solve the major scientific problem for video
    retrieval: the semantic gap. To bridge the gap, we lay down the anatomy
    of a concept-based video search engine. We present a component-wise
    decomposition of such an interdisciplinary multimedia system, covering
    influences from information retrieval, computer vision, machine learning,
    and human-computer interaction. For each of the components we
    review state-of-the-art solutions in the literature, each having different
    characteristics and merits. Because of these differences, we cannot
    understand the progress in video retrieval without serious evaluation
    efforts such as carried out in the NIST TRECVID benchmark. We
    discuss its data, tasks, results, and the many derived community
    initiatives in creating annotations and baselines for repeatable experiments.
    We conclude with our perspective on future challenges and
    opportunities.
  }
}

% Smeaton, Wilkins, Worring, de Rooij, Chua, Luan -- International Journal of
% Imaging Systems and Technology 18(2--3), 2008.
% Month, PDF link and abstract are not available, so those fields are omitted.
@article{SmeatonIJIST08,
  author  = {Alan F. Smeaton and Peter Wilkins and Marcel Worring and Ork de Rooij and Tat-Seng Chua and Huanbo Luan},
  title   = {Content-based Video Retrieval: Three Example Systems from {TRECVid}},
  journal = {International Journal of Imaging Systems and Technology},
  year    = {2008},
  volume  = {18},
  number  = {2--3},
  pages   = {195--201}
}

% Nguyen, Worring -- Journal of Visual Languages and Computing 19(2), 2008.
@article{NguyenJVLC08,
  author   = {Giang P. Nguyen and Marcel Worring},
  title    = {Interactive Access to Large Image Collections using Similarity-based Visualization},
  journal  = {Journal of Visual Languages and Computing},
  year     = {2008},
  month    = {April},
  volume   = {19},
  number   = {2},
  pages    = {203--224},
  pdf      = {http://www.science.uva.nl/research/mediamill/pub/nguyen-similarity-visualization-jvlc.pdf},
  abstract = {
    Image collections are getting larger and larger. To access those
    collections, systems for managing, searching, and browsing are
    necessary. Visualization plays an essential role in such systems.
    Existing visualization systems do not analyze all the problems
    occurring when dealing with large visual collections. In this
    paper, we make these problems explicit. From there, we establish
    three general requirements: overview, visibility, and structure
    preservation. Solutions for each requirement are proposed, as well
    as functions balancing the different requirements. We present an
    optimal visualization scheme, supporting users in interacting with
    large image collections. Experimental results with a collection of
    10,000 Corel images, using simulated user actions, show that the
    proposed scheme significantly improves performance for a given
    task compared to the 2D grid-based visualizations commonly used in
    content-based image retrieval.
  }
}

% Snoek, Worring, de Rooij, van de Sande, Yan, Hauptmann -- IEEE MultiMedia 15(1), 2008.
% "van de Sande" is written unbraced so BibTeX parses "van de" as the von
% part, matching how the same author is entered in the other entries.
@article{SnoekMM08,
  author   = {Cees G. M. Snoek and Marcel Worring and Ork de Rooij and Koen E. A. van de Sande and Rong Yan and Alexander G. Hauptmann},
  title    = {{VideOlympics}: Real-Time Evaluation of Multimedia Retrieval Systems},
  journal  = {{IEEE} MultiMedia},
  year     = {2008},
  month    = {January--March},
  volume   = {15},
  number   = {1},
  pages    = {86--91},
  pdf      = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-videolympics-mm.pdf},
  abstract = {
    Video search is an experience for the senses. As a result, traditional
    information retrieval metrics can't fully measure the quality of a video
    search system. To provide a more interactive assessment of today's video
    search engines, the authors have organized the VideOlympics as a real-time
    evaluation showcase where systems compete to answer specific video searches
    in front of a live audience. At VideOlympics, seeing and hearing is believing.
  }
}

% Nguyen, Worring -- ACM TOMCCAP 4(1), 2008.
% Pages use the ACM article-number form "article:first--article:last"
% (article 7, pages 1 to 23).
@article{NguyenTOMCCAP08,
  author   = {Giang P. Nguyen and Marcel Worring},
  title    = {Optimization of Interactive Visual-Similarity-Based Search},
  journal  = {{ACM} Transactions on Multimedia Computing, Communications and Applications},
  year     = {2008},
  month    = {January},
  volume   = {4},
  number   = {1},
  pages    = {7:1--7:23},
  pdf      = {http://www.science.uva.nl/research/mediamill/pub/nguyen-optimization-tomccap.pdf},
  abstract = {
    At one end of the spectrum, research in interactive content-based
    retrieval concentrates on machine learning methods for effective
    use of relevance feedback. On the other end, the information
    visualization community focuses on effective methods for conveying
    information to the user. What is lacking is research considering
    the information visualization and interactive retrieval as truly
    integrated parts of one content-based search system. In such an
    integrated system, there are many degrees of freedom like the
    similarity function, the number of images to display, the image
    size, different visualization modes, and possible feedback modes.
    To base the optimal values for all of those on user studies is
    unfeasible. We therefore develop search scenarios in which tasks
    and user actions are simulated. From there, the proposed scheme is
    optimized based on objective constraints and evaluation criteria.
    In such a manner, the degrees of freedom are reduced and the
    remaining degrees can be evaluated in user studies. In this article,
    we present a system that integrates advanced similarity based
    visualization with active learning. We have performed extensive
    experimentation on interactive category search with different
    image collections. The results using the proposed simulation
    scheme show that indeed the use of advanced visualization and
    active learning pays off in all of these datasets.
  }
}

% Nguyen, Worring, Smeulders -- IEEE Transactions on Multimedia 9(7), 2007.
@article{NguyenTMM07,
  author   = {Giang P. Nguyen and Marcel Worring and Arnold W. M. Smeulders},
  title    = {Interactive Search by Direct Manipulation of Dissimilarity Space},
  journal  = {{IEEE} Transactions on Multimedia},
  year     = {2007},
  month    = {November},
  volume   = {9},
  number   = {7},
  pages    = {1404--1415},
  pdf      = {http://www.science.uva.nl/research/mediamill/pub/nguyen-dissimilarity-tmm.pdf},
  abstract = {
    In this paper, we argue to learn dissimilarity for interactive search in
    content based image retrieval. In literature, dissimilarity is often learned
    via the feature space by feature selection, feature weighting or by adjusting
    the parameters of a function of the features. Other than existing techniques,
    we use feedback to adjust the dissimilarity space independent of feature space.
    This has the great advantage that it manipulates dissimilarity directly. To
    create a dissimilarity space, we use the method proposed by Pekalska and Duin,
    selecting a set of images called prototypes and computing distances to those
    prototypes for all images in the collection. After the user gives feedback,
    we apply active learning with a one-class support vector machine to decide the
    movement of images such that relevant images stay close together while irrelevant
    ones are pushed away (the work of Guo). The dissimilarity space is then adjusted
    accordingly. Results on a Corel dataset of 10000 images and a TrecVid collection
    of 43907 keyframes show that our proposed approach is not only intuitive, it
    also significantly improves the retrieval performance.
  }
}

% Seinstra, Geusebroek, Koelma, Snoek, Worring, Smeulders -- IEEE MultiMedia 14(4), 2007.
@article{SeinstraMM07,
  author   = {Frank J. Seinstra and Jan-Mark Geusebroek and Dennis Koelma and Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders},
  title    = {High-Performance Distributed Image and Video Content Analysis with Parallel-Horus},
  journal  = {{IEEE} MultiMedia},
  year     = {2007},
  month    = {October--December},
  volume   = {14},
  number   = {4},
  pages    = {64--75},
  pdf      = {http://staff.science.uva.nl/~cgmsnoek/pub/seinstra-parallel-horus-mm.pdf},
  abstract = {
    As the world uses more digital video that requires greater storage space,
    Grid computing is becoming indispensable for urgent problems in multimedia
    content analysis. Parallel-Horus, a support tool for applications in multimedia
    Grid computing, lets users implement multimedia applications as sequential
    programs for efficient execution on clusters and Grids, based on wide-area
    multimedia services.
  }
}

% Snoek, Huurnink, Hollink, de Rijke, Schreiber, Worring -- IEEE Transactions
% on Multimedia 9(5), 2007.
@article{SnoekTMM07b,
  author   = {Cees G. M. Snoek and Bouke Huurnink and Laura Hollink and Maarten de Rijke and Guus Schreiber and Marcel Worring},
  title    = {Adding Semantics to Detectors for Video Retrieval},
  journal  = {{IEEE} Transactions on Multimedia},
  year     = {2007},
  month    = {August},
  volume   = {9},
  number   = {5},
  pages    = {975--986},
  pdf      = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-semantics2detectors-tmm.pdf},
  abstract = {
    In this paper, we propose an automatic video retrieval method based on high-level
    concept detectors. Research in video analysis has reached the point where over 100
    concept detectors can be learned in a generic fashion, albeit with mixed performance.
    Such a set of detectors is very small still compared to ontologies aiming to capture
    the full vocabulary a user has. We aim to throw a bridge between the two fields by
    building a multimedia thesaurus, i.e., a set of machine learned concept detectors
    that is enriched with semantic descriptions and semantic structure obtained from
    WordNet. Given a multimodal user query, we identify three strategies to select a
    relevant detector from this thesaurus, namely: text matching, ontology querying,
    and semantic visual querying. We evaluate the methods against the automatic search
    task of the TRECVID 2005 video retrieval benchmark, using a news video archive of
    85 h in combination with a thesaurus of 363 machine learned concept detectors. We
    assess the influence of thesaurus size on video search performance, evaluate and
    compare the multimodal selection strategies for concept detectors, and finally
    discuss their combined potential using oracle fusion. The set of queries in the
    TRECVID 2005 corpus is too small for us to be definite in our conclusions, but the
    results suggest promising new lines of research.
  }
}

% Worring, Schreiber -- IEEE Transactions on Multimedia 9(5), 2007
% (introduction to a special section, per the abstract).
@article{WorringTMM07,
  author   = {Marcel Worring and Guus Schreiber},
  title    = {Semantic Image and Video Indexing in Broad Domains},
  journal  = {{IEEE} Transactions on Multimedia},
  year     = {2007},
  month    = {August},
  volume   = {9},
  number   = {5},
  pages    = {909--911},
  pdf      = {http://www.science.uva.nl/research/mediamill/pub/worring-special-issue-tmm.pdf},
  abstract = {
    The six papers in this special section focus on semantic image and
    video indexing in broad domains. To bring semantics to the user in
    broad domains both the indexing and retrieval step have to be considered.
    The papers here address both steps and the relation to ontologies.
  }
}

% Snoek, Worring, Koelma, Smeulders -- IEEE Transactions on Multimedia 9(2), 2007.
@article{SnoekTMM07,
  author   = {Cees G. M. Snoek and Marcel Worring and Dennis C. Koelma and Arnold W. M. Smeulders},
  title    = {A Learned Lexicon-Driven Paradigm for Interactive Video Retrieval},
  journal  = {{IEEE} Transactions on Multimedia},
  year     = {2007},
  month    = {February},
  volume   = {9},
  number   = {2},
  pages    = {280--292},
  pdf      = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-lexicon-tmm.pdf},
  abstract = {
    Effective video retrieval is the result of an interplay between
    interactive query selection, advanced visualization of results, and
    a goal-oriented human user. Traditional interactive video retrieval
    approaches emphasize paradigms, such as query-by-keyword and
    query-by-example, to aid the user in the search for relevant
    footage. However, recent results in automatic indexing indicate that
    query-by-concept is becoming a viable resource for interactive
    retrieval also. We propose in this paper a new video retrieval
    paradigm. The core of the paradigm is formed by first detecting a
    large lexicon of semantic concepts. From there, we combine
    query-by-concept, query-by-example, query-by-keyword, and user
    interaction into the \emph{MediaMill} semantic video search engine.
    To measure the impact of increasing lexicon size on interactive
    video retrieval performance, we performed two experiments against
    the 2004 and 2005 NIST TRECVID benchmarks, using lexicons containing
    32 and 101 concepts respectively. The results suggest that from all
    factors that play a role in interactive retrieval, a large lexicon
    of semantic concepts matters most. Indeed, by exploiting large
    lexicons, many video search questions are solvable without using
    query-by-keyword and query-by-example. What is more, we show that
    the lexicon-driven search engine outperforms all state-of-the-art
    video retrieval systems in both TRECVID 2004 and 2005.
  }
}

% Snoek, Worring, Geusebroek, Koelma, Seinstra, Smeulders -- IEEE TPAMI 28(10), 2006.
@article{SnoekPAMI06,
  author   = {Cees G. M. Snoek and Marcel Worring and Jan-Mark Geusebroek and Dennis C. Koelma and Frank J. Seinstra and Arnold W. M. Smeulders},
  title    = {The Semantic Pathfinder: Using an Authoring Metaphor for Generic Multimedia Indexing},
  journal  = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
  year     = {2006},
  month    = {October},
  volume   = {28},
  number   = {10},
  pages    = {1678--1689},
  pdf      = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-pathfinder-pami.pdf},
  abstract = {
    This paper presents the semantic pathfinder architecture for
    generic indexing of multimedia archives. The semantic pathfinder
    extracts semantic concepts from video by exploring different paths
    through three consecutive analysis steps, which we derive from the
    observation that produced video is the result of an
    authoring-driven process. We exploit this \emph{authoring
    metaphor} for machine-driven understanding. The pathfinder starts
    with the content analysis step. In this analysis step, we follow a
    data-driven approach of indexing semantics. The style analysis
    step is the second analysis step. Here we tackle the indexing
    problem by viewing a video from the perspective of production.
    Finally, in the context analysis step, we view semantics in
    context. The virtue of the semantic pathfinder is its ability to
    learn the best path of analysis steps on a per-concept basis. To
    show the generality of this novel indexing approach we develop
    detectors for a lexicon of 32 concepts and we evaluate the
    semantic pathfinder against the 2004 NIST TRECVID video retrieval
    benchmark, using a news archive of 64 hours. Top ranking
    performance in the semantic concept detection task indicates the
    merit of the semantic pathfinder for generic indexing of
    multimedia archives.
  }
}

% Snoek, Worring, Hauptmann -- ACM TOMCCAP 2(2), 2006.
@article{SnoekTOMCCAP06,
  author   = {Cees G. M. Snoek and Marcel Worring and Alexander G. Hauptmann},
  title    = {Learning Rich Semantics from News Video Archives by Style Analysis},
  journal  = {{ACM} Transactions on Multimedia Computing, Communications and Applications},
  year     = {2006},
  month    = {May},
  volume   = {2},
  number   = {2},
  pages    = {91--108},
  pdf      = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-style-tomccap.pdf},
  abstract = {
    We propose a generic and robust framework for news video indexing, which
    we found on a broadcast news production model. We identify within this
    model four production phases, each providing useful metadata for annotation.
    In contrast to semi-automatic indexing approaches, which exploit this
    information at production time, we adhere to an automatic data-driven
    approach. To that end, we analyze a digital news video using a separate
    set of multimodal detectors for each production phase. By combining the
    resulting production-derived features into a statistical classifier
    ensemble, the framework facilitates robust classification of several rich
    semantic concepts in news video; rich meaning that concepts share many
    similarities in their production process. Experiments on an archive of
    120 hours of news video, from the 2003 TRECVID benchmark, show that a
    combined analysis of production phases yields the best results. In addition,
    we demonstrate that the accuracy of the proposed style analysis framework
    for classification of several rich semantic concepts is state-of-the-art.
  }
}

% Hollink, Nguyen, Koelma, Schreiber, Worring -- volume 152(6), 2005.
% NOTE(review): the journal name was corrected from "IEE on Vision, ..." to the
% IEE Proceedings title; confirm against the publisher record.
@article{HollinkVISP05,
  author   = {Laura Hollink and Giang P. Nguyen and Dennis C. Koelma and Guus Schreiber and Marcel Worring},
  title    = {Assessing user behaviour in news video retrieval},
  journal  = {{IEE} Proceedings -- Vision, Image and Signal Processing},
  year     = {2005},
  month    = {December},
  volume   = {152},
  number   = {6},
  pages    = {911--918},
  pdf      = {http://staff.science.uva.nl/~giangnp/pubs/pdf/2005/Hollink05.pdf},
  abstract = {
    The results of a study are presented, in which people queried a news
    archive using an interactive video retrieval system. 242 search sessions
    by 39 participants on 24 topics were assessed. Before, during and after
    the study, participants filled in questionnaires about their expectations
    of a search. The questionnaire data, logged user actions on the system,
    queries formulated by users, and a quality measure of each search were
    studied. The results of the study show that topics concerning 'specific'
    people or objects were better retrieved than topics concerning 'general'
    objects and scenes. Users were able to estimate the overall quality of a
    search but did not know when the optimal result was reached within the
    search process. Analysis of the results at various stages in the retrieval
    process suggests that retrieval based on transcriptions of the speech in
    video data adds more to the average precision of the result than
    content-based image retrieval based on low-level visual features. The
    latter is particularly useful in providing the user with an overview of
    the dataset and thus an indication of the success of a search. Based on
    the results, implications for the design of user interfaces of video
    retrieval systems are discussed.
  }
}

% Nguyen, Worring -- Multimedia Systems 10(6), 2005.
@article{NguyenMS05,
  author   = {Giang P. Nguyen and Marcel Worring},
  title    = {Relevance feedback based saliency adaptation in {CBIR}},
  journal  = {{ACM} Springer Multimedia Systems},
  year     = {2005},
  month    = {October},
  volume   = {10},
  number   = {6},
  pages    = {499--512},
  pdf      = {http://staff.science.uva.nl/~giangnp/pubs/pdf/2005/giangACM_MS05.pdf},
  abstract = {
    Content-based image retrieval ({CBIR}) has been under investigation
    for a long time with many systems built to meet different
    application demands. However, in all systems, there is still a gap
    between the user's expectation and the system's retrieval
    capabilities. Therefore, user interaction is an essential
    component of any {CBIR} system. Interaction up to now has mostly
    focused on changing global image features or similarities between
    images. We consider the interaction with salient details in the
    image i.e. points, lines, and regions. Interactive salient detail
    definition goes further than summarizing the image into a set of
    salient details. We aim to dynamically update the user- and
    context-dependent definition of saliency based on relevance
    feedback. To that end, we propose an interaction framework for
    salient details from the perspective of the user. A number of
    instantiations of the framework are presented. Finally, we apply
    our approach for query refinement in detail based image retrieval
    system with salient points and regions. Experimental results prove
    the effectiveness of adapting the saliency from user feedback in
    the retrieval process.
  }
}

% Snoek, Worring -- IEEE Transactions on Multimedia 7(4), 2005.
@article{SnoekTMM05,
  author   = {Cees G. M. Snoek and Marcel Worring},
  title    = {Multimedia Event-Based Video Indexing using Time Intervals},
  journal  = {{IEEE} Transactions on Multimedia},
  year     = {2005},
  month    = {August},
  volume   = {7},
  number   = {4},
  pages    = {638--647},
  pdf      = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-time-mm.pdf},
  abstract = {
    We propose the Time Interval Multimedia Event (TIME) framework as a robust
    approach for classification of semantic events in multimodal video documents.
    The representation used in TIME extends the Allen time relations and allows
    for proper inclusion of context and synchronization of the heterogeneous
    information sources involved in multimodal video analysis. To demonstrate the
    viability of our approach, it was evaluated on the domains of soccer and news
    broadcasts. For automatic classification of semantic events, we compare three
    different machine learning techniques, i.c. C4.5 decision tree, Maximum
    Entropy, and Support Vector Machine. The results show that semantic video
    indexing results significantly benefit from using the TIME framework.
  }
}

@ARTICLE{SnoekMTAP05,
  AUTHOR   = {Cees G. M. Snoek and Marcel Worring},
  TITLE    = {Multimodal Video Indexing: A Review of the State-of-the-art},
  JOURNAL  = {Multimedia Tools and Applications},
  VOLUME   = {25},
  NUMBER   = {1},
  PAGES    = {5--35},
  MONTH    = {January},
  YEAR     = {2005},
  PDF      = {http://staff.science.uva.nl/~cgmsnoek/pub/snoek-review-mmta.pdf},
  ABSTRACT = {
    Efficient and effective handling of video documents depends on the
    availability of indexes. Manual indexing is unfeasible for large
    video collections. In this paper we survey several methods aiming at
    automating this time and resource consuming process. Good reviews on
    single modality based video indexing have appeared in literature.
    Effective indexing, however, requires a multimodal approach in which
    either the most appropriate modality is selected or the different
    modalities are used in collaborative fashion. Therefore, instead of
    separately treating the different information sources involved, and
    their specific algorithms, we focus on the similarities and
    differences between the modalities. To that end we put forward a
    unifying and multimodal framework, which views a video document from
    the perspective of its author. This framework forms the guiding
    principle for identifying index types, for which automatic methods
    are found in literature. It furthermore forms the basis for
    categorizing these different methods.
  }
}

@ARTICLE{VendrigMM03,
  AUTHOR = {Jeroen Vendrig and Marcel Worring},
  TITLE = {Interactive adaptive movie annotation},
  JOURNAL = {{IEEE} MultiMedia},
  VOLUME = {10},
  NUMBER = {3},
  PAGES = {30--37},
  MONTH = {July},
  YEAR = {2003},
  PDF = {http://staff.science.uva.nl/~worring/pub/papers/i-notation.pdf},
  ABSTRACT = {
  
  		 Effectively labeling the visual content of movies is essential for annotation. We 
  		 present the interactive and adaptive i-Notation system, which describes actors' names, 
  		 automatically processes multimodal information sources, and deals with available 
  		 sources' varying quality. It provides the basis for intelligent interaction and 
  		 demonstrates significant improvements in annotation efficiency.
  		
  		}
}

@ARTICLE{VendrigTMM02,
  AUTHOR = {Jeroen Vendrig and Marcel Worring},
  TITLE = {Systematic evaluation of Logical Story Unit Segmentation},
  JOURNAL = {{IEEE} Transactions on Multimedia},
  VOLUME = {4},
  NUMBER = {4},
  PAGES = {492--499},
  MONTH = {December},
  YEAR = {2002},
  PDF = {http://staff.science.uva.nl/~worring/pub/papers/vendrig-lsu-eval.pdf},
  ABSTRACT = {
  		 
  		 Although various Logical Story Unit (LSU) segmentation methods based on visual content 
  		 have been presented in literature, a common ground for comparison is missing. We present 
  		 a systematic evaluation of the mutual dependencies of segmentation methods and their 
  		 performances. LSUs are subjective and cannot be defined with full certainty. To limit 
  		 subjectivity, we present definitions based on film theory. For evaluation, we introduce 
  		 a method measuring the quality of a segmentation method and its economic impact rather 
  		 than the amount of errors. Furthermore, the inherent complexity of the segmentation 
  		 problem given a visual feature is measured. Also, we show to what extent LSU segmentation 
  		 depends on the quality of shot boundary segmentation. To understand LSU segmentation, we 
  		 present a unifying framework classifying segmentation methods into four essentially 
  		 different types. We present results of an evaluation of the four types under similar 
  		 circumstances using an unprecedented amount of 20 hours of 17 complete videos in different 
  		 genres. Tools and ground truths are available for interactive use via internet.
		
		}
}

@ARTICLE{VendrigMTAP01,
  AUTHOR = {Jeroen Vendrig and Marcel Worring and Arnold W. M. Smeulders},
  TITLE = {Filter Image Browsing: Interactive Image Retrieval by Using Database Overviews},
  JOURNAL = {Multimedia Tools and Applications},
  MONTH = {September},
  YEAR = {2001},
  VOLUME = {15},
  NUMBER = {1},
  PAGES = {83--103},
  PDF = {http://staff.science.uva.nl/~worring/pub/papers/vendrig-fib.pdf},
  ABSTRACT = {
  		 
		 Human-computer interaction is a decisive factor in effective content-based access to 
		 large image repositories. In current image retrieval systems the user refines his query 
		 by selecting example images from a relevance ranking. Since the top ranked images are 
		 all similar, user feedback often results in rearrangement of the presented images only.
		 For better incorporation of user interaction in the retrieval process, we have developed 
		 the Filter Image Browsing method. It also uses feedback through image selection. However, 
		 it is based on differences between images rather than similarities. Filter Image Browsing 
		 presents overviews of relevant parts of the database to users. Through interaction users 
		 then zoom in on parts of the image collection. By repeatedly limiting the information 
		 space, the user quickly ends up with a small amount of relevant images. The method can 
		 easily be extended for the retrieval of multimedia objects. For evaluation of the Filter 
		 Image Browsing retrieval concept, a user simulation is applied to a pictorial database 
		 containing 10,000 images acquired from the World Wide Web by a search robot. The 
		 simulation incorporates uncertainty in the definition of the information need by users. 
		 Results show Filter Image Browsing outperforms plain interactive similarity ranking in 
		 required effort from the user. Also, the method produces predictable results for 
		 retrieval sessions, so that the user quickly knows if a successful session is possible 
		 at all. Furthermore, the simulations show the overview techniques are suited for 
		 applications such as hand-held devices where screen space is limited.
  		 
		}
}

@ARTICLE{SmeuldersPAMI00,
  AUTHOR = {Arnold W. M. Smeulders and Marcel Worring and Simone Santini and Amarnath Gupta and Ramesh Jain},
  TITLE = {Content-Based Image Retrieval at the End of the Early Years},
  JOURNAL = {{IEEE} Transactions on Pattern Analysis and Machine Intelligence},
  VOLUME = {22},
  NUMBER = {12},
  PAGES = {1349--1380},
  MONTH = {December},
  YEAR = {2000},
  PDF = {http://www.science.uva.nl/~smeulder/pubs/PAMI2000Smeulders.pdf},
  ABSTRACT = {

  		 The paper presents a review of 200 references in content-based image retrieval. The
  		 paper starts with discussing the working conditions of content-based retrieval:
  		 patterns of use, types of pictures, the role of semantics, and the sensory gap.
  		 Subsequent sections discuss computational steps for image retrieval systems. Step one
  		 of the review is image processing for retrieval sorted by color, texture, and local
  		 geometry. Features for retrieval are discussed next, sorted by: accumulative and
  		 global features, salient points, object and shape features, signs, and structural
  		 combinations thereof. Similarity of pictures and objects in pictures is reviewed for
  		 each of the feature types, in close connection to the types and means of feedback the
  		 user of the systems is capable of giving by interaction. We briefly discuss aspects
  		 of system engineering: databases, system architecture, and evaluation. In the
  		 concluding section, we present our view on: the driving force of the field, the
  		 heritage from computer vision, the influence on computer vision, the role of
  		 similarity and of interaction, the need for databases, the problem of evaluation, and
  		 the role of the semantic gap.

  		}
}