void _ClusterDataset() { Dispatcher.Invoke((SimpleDelegate) delegate() { _statusMessage.Add("Downloading dataset..."); }); var uri = new Uri("https://archive.ics.uci.edu/ml/machine-learning-databases/00307/%5bUCI%5d%20AAAI-14%20Accepted%20Papers%20-%20Papers.csv"); var KEYWORD_SPLIT = " \n".ToCharArray(); var TOPIC_SPLIT = "\n".ToCharArray(); // download the document list var docList = new List <AAAIDocument>(); using (var client = new WebClient()) { var data = client.DownloadData(uri); Dispatcher.Invoke((SimpleDelegate) delegate() { _statusMessage.Add("Building data table..."); }); // parse the file CSV var dataTable = new StreamReader(new MemoryStream(data)).ParseCSV(','); // create strongly typed documents from the data table dataTable.ForEach(row => docList.Add(new AAAIDocument { Abstract = row.GetField <string>(5), Keyword = row.GetField <string>(3).Split(KEYWORD_SPLIT, StringSplitOptions.RemoveEmptyEntries).Select(str => str.ToLower()).ToArray(), Topic = row.GetField <string>(4).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries), Group = row.GetField <string>(2).Split(TOPIC_SPLIT, StringSplitOptions.RemoveEmptyEntries), Title = row.GetField <string>(0) })); } // create a document lookup table var docTable = docList.ToDictionary(d => d.Title, d => d); // extract features from the document's metadata var stringTable = new StringTableBuilder(); var classificationSet = new SparseVectorClassificationSet { Classification = docList.Select(d => d.AsClassification(stringTable)).ToArray() }; // normalise the document/t var encodings = classificationSet.Vectorise(true); // convert the sparse feature vectors into dense vectors var documentClusterList = new List <DocumentCluster>(); using (var lap = Provider.CreateLinearAlgebra()) { var lookupTable = encodings .Select(d => Tuple.Create(d, lap.Create(d.Data).AsIndexable())) .ToDictionary(d => d.Item2, d => docTable[d.Item1.Classification]) ; var vectorList = lookupTable.Select(d => d.Key).ToList(); Dispatcher.Invoke((SimpleDelegate) delegate() { _statusMessage.Add("Clustering data..."); }); // cluster the dense vectors using (var nnmf = new NNMF(lap, vectorList, _clusterColour.Length)) { var clusters = nnmf.Cluster(40, cost => { Dispatcher.Invoke((SimpleDelegate) delegate() { _statusMessage.Add("NNMF error: " + cost.ToString()); }); }); // create document clusters from the NNMF results int index = 0; foreach (var cluster in clusters) { var documentCluster = new List <AAAIDocument>(); foreach (var item in cluster) { var document = lookupTable[item]; documentCluster.Add(document); } var desc = String.Join(", ", nnmf.GetRankedFeatures(index++) .Select(i => stringTable.GetString(i)) .Take(32) ); documentClusterList.Add(new DocumentCluster(documentCluster, desc)); } // collect the cluster membership for each document for (int i = 0, len = vectorList.Count; i < len; i++) { lookupTable[vectorList[i]].ClusterMembership = nnmf.GetClusterMembership(i); } } } Dispatcher.Invoke((SimpleDelegate) delegate() { _UpdateUI(documentClusterList); }); }