% Conference paper from the ECIR 2006 proceedings; the citation key is kept exactly as exported.
% NOTE(review): the export gives no series volume number, and "address" holds a country
% rather than the publisher's city -- confirm both against the publisher record.
@inproceedings{769f3691c2f34725a496f8da0a41b2f8,
  author    = {Vinay, Vishwa and Cox, Ingemar J. and Milic-Frayling, Natasa and Wood, Ken},
  title     = {Measuring the complexity of a collection of documents},
  booktitle = {Advances in Information Retrieval - 28th European Conference on {IR} Research, {ECIR} 2006, Proceedings},
  series    = {Lecture Notes in Computer Science},
  pages     = {107--118},
  publisher = {Springer Verlag},
  address   = {Germany},
  year      = {2006},
  doi       = {10.1007/11735106_11},
  isbn      = {3540333479},
  language  = {English},
  note      = {28th European Conference on Information Retrieval Research, ECIR 2006 ; Conference date: 10-04-2006 Through 12-04-2006},
  abstract  = {Some text collections are more difficult to search or more complex to organize into topics than others. What properties of the data characterize this complexity? We use a variation of the Cox-Lewis statistic to measure the natural tendency of a set of points to fall into clusters. We compute this quantity for document collections that are represented as a set of term vectors. We consider applications of the Cox-Lewis statistic in three scenarios: comparing clusterability of different text collections using the same representation, comparing different representations of the same text collection, and predicting the query performance based on the clusterability of the query results set. Our experimental results show a correlation between the observed effectiveness and this statistic, thereby demonstrating the utility of such data analysis in text retrieval.},
}