You have to implement two unsupervised learning methods in Python, chosen from the list of proposed algorithms below, extending the library of methods used in the course (AMLTLearn).

You should fork the library from GitHub so that it is easier to integrate your code later, once it is working. When implementing the algorithms you must follow the API conventions used by scikit-learn.

Each student has to pick a different option, so you must send an email to bejar@cs.upc.edu as soon as possible telling what option you are going to implement.

You can also propose your own coursework: if you are interested in other unsupervised algorithms, send an email or ask your professor during class.

Apart from the implementation of the algorithms you have to write a report comparing the algorithms to other similar algorithms using generated and real datasets (for instance from UCI).

You have a document posted in the raco explaining the assignment and evaluation criteria.

  The deadline for delivering the code and the report is January 9th.

Option 1 (Assigned)

  • Hopkins Statistic for data clusterness (ask me for the reference)
  • Yinyang k-means
@inproceedings{ding2015yinyang,
  author    = {Ding, Yufei and Zhao, Yue and Shen, Xipeng and Musuvathi, Madanlal and Mytkowicz, Todd},
  title     = {Yinyang k-means: A drop-in replacement of the classic k-means with consistent speedup},
  booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML-15)},
  year      = {2015},
  pages     = {579--587},
}

Option 2 (Assigned)

  • Distance Based Outlier detection (ask me for the reference)
  • Bisecting k-means
@inproceedings{steinbach2000comparison,
  author    = {Steinbach, Michael and Karypis, George and Kumar, Vipin and others},
  title     = {A comparison of document clustering techniques},
  booktitle = {KDD workshop on text mining},
  volume    = {400},
  number    = {1},
  pages     = {525--526},
  year      = {2000},
  address   = {Boston},
}

Option 3 (Assigned)

  • Volkovich et al. Stability cluster validity index
@inproceedings{Volkovich2009,
  author    = {Volkovich, Z. and Barzily, Zeev and Avros, Renata and Toledano-Kitai, Dvora},
  title     = {On application of the K-nearest neighbors approach for cluster validation},
  booktitle = {International Conference Applied Stochastic Models and Data Analysis},
  year      = {2009},
  pages     = {468--472},
}
  • Capó, Marco; Pérez, Aritz; Lozano, Jose A. An efficient approximation to the K-means clustering for massive data
@article{Capo2016,
  author  = {Capó, Marco and Pérez, Aritz and Lozano, Jose A.},
  title   = {An efficient approximation to the K-means clustering for massive data},
  journal = {Knowledge-Based Systems},
  year    = {2016},
  doi     = {10.1016/j.knosys.2016.06.031},
}

Option 4 (Assigned)

  • The gap statistic for assessing the number of clusters
@article{tibshirani2001estimating,
  author    = {Tibshirani, Robert and Walther, Guenther and Hastie, Trevor},
  title     = {Estimating the number of clusters in a data set via the gap statistic},
  journal   = {Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
  year      = {2001},
  volume    = {63},
  number    = {2},
  pages     = {411--423},
  publisher = {Wiley Online Library},
}
  • Scalable hierarchical clustering
@article{Patra2011,
  author  = {Patra, Bidyut Kr and Nandi, Sukumar and Viswanath, P.},
  title   = {A distance based clustering method for arbitrary shaped clusters in large datasets},
  journal = {Pattern Recognition},
  volume  = {44},
  number  = {12},
  pages   = {2862--2870},
  year    = {2011},
  doi     = {10.1016/j.patcog.2011.04.027},
}

Option 5 (Assigned)

  • Dash&Liu unsupervised feature selection algorithm
@inproceedings{Dash1999,
  author    = {Dash, Manoranjan and Liu, Huan},
  title     = {Handling Large Unsupervised Data via Dimensionality Reduction},
  booktitle = {SIGMOD Workshop on Research Issues on Data Mining and Knowledge Discovery},
  year      = {1999},
}
  • Rough DBSCAN
@article{Viswanath2009,
  author  = {Viswanath, P. and Babu, V. Suresh},
  title   = {{Rough-DBSCAN}: A fast hybrid density based clustering method for large data sets},
  journal = {Pattern Recognition Letters},
  year    = {2009},
  volume  = {30},
  number  = {16},
  pages   = {1477--1488},
  doi     = {10.1016/j.patrec.2009.08.008},
}

Option 6 (Assigned)

  • Mitra, Murthy & Pal unsupervised feature selection algorithm
@article{Mitra2002,
  author  = {Mitra, Pabitra and Murthy, C. A. and Pal, Sankar K.},
  title   = {Unsupervised Feature Selection Using Feature Similarity},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume  = {24},
  number  = {3},
  pages   = {301--312},
  year    = {2002},
}
  • Probabilistic hierarchical clustering (ask me for the reference)

Option 7 (Assigned)

  • Implement some of the methods for ensemble clustering described in
@article{Iam-On2015,
  author    = {Iam-On, Natthakan and Boongoen, Tossapon},
  title     = {Comparative study of matrix refinement approaches for ensemble clustering},
  journal   = {Machine Learning},
  year      = {2015},
  volume    = {98},
  number    = {1--2},
  pages     = {269--300},
  doi       = {10.1007/s10994-013-5342-y},
  issn      = {0885-6125},
  keywords  = {Cluster ensemble; Multiple clusterings; Summarization; Information matrix},
  language  = {English},
  publisher = {Springer US},
  url       = {http://dx.doi.org/10.1007/s10994-013-5342-y},
}

Option 8 (Assigned)

  • Hamerly, G. & Elkan, C. Harmonic K-means
@inproceedings{Hamerly2002,
  author    = {Hamerly, Greg and Elkan, Charles},
  title     = {Alternatives to the k-means algorithm that find better clusterings},
  booktitle = {Proceedings of the Eleventh International Conference on Information and Knowledge Management ({CIKM}-02)},
  editor    = {Kalpakis, Konstantinos and Goharian, Nazli and Grossmann, David},
  pages     = {600--607},
  publisher = {ACM Press},
  address   = {New York},
  month     = nov # { ~4--9},
  year      = {2002},
}
  • Lago-Fernández, L. F. & Corbacho, F. Normality-based validation for crisp clustering
  • @article{Lago-Fernandez2010,
    author   = {Luis F. Lago-Fernández and Fernando Corbacho},
    title    = {Normality-based validation for crisp clustering},
    journal  = {Pattern Recognition},
    year     = {2010},
    volume   = {43},
    number   = {3},
    pages    = {782--795},
    issn     = {0031-3203},
    doi      = {10.1016/j.patcog.2009.09.018},
    url      = {http://www.sciencedirect.com/science/article/pii/S0031320309003628},
    keywords = {Clustering Validation},
    }

    Option 9 (Assigned)

    • Implement the method described in this paper
    @inproceedings{Cleuziou2008,
      author    = {Cleuziou, G.},
      title     = {An extended version of the k-means method for overlapping clustering},
      booktitle = {19th International Conference on Pattern Recognition ({ICPR} 2008)},
      year      = {2008},
      month     = dec,
      pages     = {1--4},
      doi       = {10.1109/ICPR.2008.4761079},
      issn      = {1051-4651},
      keywords  = {fuzzy set theory;pattern clustering;data coverage;fuzzy clustering;k-means
                   method extended version;overlapping clustering;overlapping k-means
                   algorithm;Clustering algorithms;Clustering methods;Constraint optimization;Data
                   analysis;Degradation;Information retrieval;Machine learning;Machine
                   learning algorithms;Partitioning algorithms;Space exploration},
    }
  • Krista Rizman Zalik and Borut Zalik Validity index for clusters of different sizes and densities
  • @article{journals/prl/ZalikZ11,
    author    = {Krista Rizman Zalik and Borut Zalik},
    title     = {Validity index for clusters of different sizes and densities},
    journal   = {Pattern Recognition Letters},
    volume    = {32},
    number    = {2},
    pages     = {221--234},
    year      = {2011},
    url       = {http://dx.doi.org/10.1016/j.patrec.2010.08.007},
    bibdate   = {2011-01-08},
    bibsource = {DBLP,
    http://dblp.uni-trier.de/db/journals/prl/prl32.html#ZalikZ11},
    }

    Option 10 (Assigned)

    • Implement the method for generating random clusters described in this paper (there is a R implementation in the package clusterGeneration)
    @article{oai:oai.columbia.edu:epic/nsdl/1/110242,
      author     = {Weiliang Qiu and Harry Joe},
      title      = {Generation of Random Clusters with Specified Degree of
                    Separation},
      journal    = {Journal of Classification},
      volume     = {23},
      number     = {2},
      pages      = {315--334},
      year       = {2006},
      publisher  = {Springer-Verlag},
      ISSN       = {1432-1343},
      doi        = {10.1007/s00357-006-0018-y},
      URL        = {http://dx.doi.org/10.1007/s00357-006-0018-y},
      bibsource  = {OAI-PMH server at oai.columbia.edu},
      identifier = {Journal of Classification 23(2), 315-334. (2006)},
      language   = {En},
      oai        = {oai:oai.columbia.edu:epic/nsdl/1/110242},
      relation   = {0176-4268; 1432-1343},
      subject    = {Statistics; Pattern Recognition; Bioinformatics;
                    Statistical Theory and Methods; Signal, Image and
                    Speech Processing; Marketing; Psychometrics},
    }

    Option 11 (Assigned)

    • Implement the seeded and constrained K-means algorithms described in the paper
    @inproceedings{basu2002semi,
      author    = {Basu, Sugato and Banerjee, Arindam and Mooney, Raymond J},
      title     = {Semi-supervised Clustering by Seeding},
      booktitle = {Proceedings of the Nineteenth International Conference on Machine Learning},
      pages     = {27--34},
      year      = {2002},
      publisher = {Morgan Kaufmann Publishers Inc.},
    }