/// <summary>
/// Runs one merge round: obtains the current closest cluster pairs from
/// <c>findClosestClusters</c> and merges them, best pair first, until the queue is
/// exhausted or a stop condition is reached.
/// </summary>
/// <param name="numUsers">Forwarded unchanged to <c>findClosestClusters</c>.</param>
/// <param name="clusters">Current clusters; modified in place (merged clusters are
/// removed and the merged result is appended).</param>
/// <param name="done">Incoming completion flag; returned unchanged unless a stop
/// condition is reached during this round.</param>
/// <returns><c>true</c> once the target cluster count or the similarity threshold has
/// been reached, otherwise the incoming value of <paramref name="done"/>.</returns>
private bool mergeClosestClusters(int numUsers, List<FastIDSet> clusters, bool done)
{
    // We find a certain number of closest clusters...
    // NOTE(review): the logic below assumes the returned list is ordered by descending
    // similarity (best pair first, worst pair last) - confirm against findClosestClusters.
    // NOTE(review): when no pairs are returned, 'done' is passed back unchanged; confirm
    // the caller cannot spin forever in that case.
    List<ClusterClusterPair> queue = findClosestClusters(numUsers, clusters);

    // The first one is definitely the closest pair in existence so we can cluster
    // the two together, put it back into the set of clusters, and start again. Instead
    // we assume everything else in our list of closest cluster pairs is still pretty good,
    // and we cluster them too.
    while (queue.Count > 0)
    {
        if (!clusteringByThreshold && clusters.Count <= numClusters)
        {
            done = true;
            break;
        }

        // Always consume the head of the queue. Indexing with an advancing counter while
        // removing would skip every other pair.
        ClusterClusterPair top = queue[0];
        queue.RemoveAt(0);

        if (clusteringByThreshold && top.getSimilarity() < clusteringThreshold)
        {
            done = true;
            break;
        }

        FastIDSet cluster1 = top.getCluster1();
        FastIDSet cluster2 = top.getCluster2();

        // Pull out current two clusters from clusters; stop as soon as both are gone.
        bool removed1 = false;
        bool removed2 = false;
        for (int m = 0; m < clusters.Count && !(removed1 && removed2); m++)
        {
            FastIDSet current = clusters[m];
            // Yes, use == here
            if (!removed1 && cluster1 == current)
            {
                clusters.RemoveAt(m);
                m--;
                removed1 = true;
            }
            else if (!removed2 && cluster2 == current)
            {
                clusters.RemoveAt(m);
                m--;
                removed2 = true;
            }
        }

        // The only catch is if a cluster showed it twice in the list of best cluster pairs;
        // have to remove the others. Pull out anything referencing these clusters from queue.
        // Walk backwards so that removing an entry never skips its neighbour.
        for (int k = queue.Count - 1; k >= 0; k--)
        {
            ClusterClusterPair pair = queue[k];
            FastIDSet pair1 = pair.getCluster1();
            FastIDSet pair2 = pair.getCluster2();
            if (pair1 == cluster1 || pair1 == cluster2 || pair2 == cluster1 || pair2 == cluster2)
            {
                queue.RemoveAt(k);
            }
        }

        // Make new merged cluster
        FastIDSet merged = new FastIDSet(cluster1.size() + cluster2.size());
        merged.addAll(cluster1);
        merged.addAll(cluster2);

        // Compare against other clusters; update queue if needed
        // That new pair we're just adding might be pretty close to something else, so
        // catch that case here and put it back into our queue
        for (int i = 0; i < clusters.Count; i++)
        {
            FastIDSet cluster = clusters[i];
            double similarity = clusterSimilarity.getSimilarity(merged, cluster);

            // With an empty queue there is no "worst queued pair" to beat, so nothing is
            // re-queued here; this also avoids indexing an empty list.
            if (queue.Count > 0 && similarity > queue[queue.Count - 1].getSimilarity())
            {
                // Insert before the first entry that is strictly less similar so the
                // queue keeps its ordering (appending at the end would break it).
                int insertAt = 0;
                while (insertAt < queue.Count && similarity <= queue[insertAt].getSimilarity())
                {
                    insertAt++;
                }
                queue.Insert(insertAt, new ClusterClusterPair(merged, cluster, similarity));
            }
        }

        // Finally add new cluster to list
        clusters.Add(merged);
    }

    return done;
}
/// <summary>
/// Builds the user clusters while holding <c>buildClustersLock</c>, then publishes
/// <c>topRecsByUserID</c> and <c>clustersByUserID</c> and sets <c>clustersBuilt</c>.
/// Starts with one cluster per user and repeatedly merges the most similar pairs until
/// the target cluster count or the similarity threshold is reached, or until no
/// comparable pair of clusters remains.
/// </summary>
private void BuildClusters()
{
    // Acquire before entering try so that finally never releases a lock that was not taken.
    buildClustersLock.Lock();
    try
    {
        DataModel model = this.DataModel;
        int numUsers = model.GetNumUsers();
        if (numUsers == 0)
        {
            topRecsByUserID = new Dictionary<object, IList<RecommendedItem>>();
            clustersByUserID = new Dictionary<object, ICollection<User>>();
        }
        else
        {
            LinkedList<ICollection<User>> clusters = new LinkedList<ICollection<User>>();

            // Begin with a cluster for each user:
            foreach (User user in model.GetUsers())
            {
                ICollection<User> newCluster = new HashedSet<User>();
                newCluster.Add(user);
                clusters.AddLast(newCluster);
            }

            bool done = false;
            while (!done)
            {
                // We find a certain number of closest clusters...
                bool full = false;
                LinkedList<ClusterClusterPair> queue = new LinkedList<ClusterClusterPair>();

                // Visit every unordered pair exactly once: the inner cursor restarts just
                // after the outer node on each outer step.
                for (LinkedListNode<ICollection<User>> outer = clusters.First; outer != null; outer = outer.Next)
                {
                    ICollection<User> cluster1 = outer.Value;
                    for (LinkedListNode<ICollection<User>> inner = outer.Next; inner != null; inner = inner.Next)
                    {
                        ICollection<User> cluster2 = inner.Value;
                        double similarity = clusterSimilarity.GetSimilarity(cluster1, cluster2);

                        // 'full' is only set after the queue has held more than numUsers
                        // entries, so queue.Last is non-null whenever it is read here.
                        if (!double.IsNaN(similarity) && (!full || similarity > queue.Last.Value.Similarity))
                        {
                            InsertByDescendingSimilarity(queue, cluster1, cluster2, similarity);
                            if (full)
                            {
                                queue.RemoveLast();
                            }
                            else if (queue.Count > numUsers)
                            {
                                // use numUsers as queue size limit
                                full = true;
                                queue.RemoveLast();
                            }
                        }
                    }
                }

                // No comparable pair left (single cluster, or every similarity is NaN):
                // nothing more can be merged, so stop instead of looping forever.
                if (queue.Count == 0)
                {
                    break;
                }

                // The first one is definitely the closest pair in existence so we can cluster
                // the two together, put it back into the set of clusters, and start again. Instead
                // we assume everything else in our list of closest cluster pairs is still pretty good,
                // and we cluster them too.
                while (queue.Count > 0)
                {
                    if (!clusteringByThreshold && clusters.Count <= numClusters)
                    {
                        done = true;
                        break;
                    }

                    ClusterClusterPair top = queue.First.Value;
                    queue.RemoveFirst();

                    if (clusteringByThreshold && top.Similarity < clusteringThreshold)
                    {
                        done = true;
                        break;
                    }

                    ICollection<User> cluster1 = top.Cluster1;
                    ICollection<User> cluster2 = top.Cluster2;

                    // Pull out current two clusters from clusters
                    // NOTE(review): LinkedList<T>.Remove(T) matches via the default equality
                    // comparer rather than reference identity; confirm HashedSet does not
                    // define value equality that could match a different cluster.
                    clusters.Remove(cluster1);
                    clusters.Remove(cluster2);

                    // The only catch is if a cluster showed it twice in the list of best cluster pairs;
                    // have to remove the others. Pull out anything referencing these clusters from queue
                    LinkedListNode<ClusterClusterPair> cursor = queue.First;
                    while (cursor != null)
                    {
                        // Capture the successor first: a removed node no longer links to it.
                        LinkedListNode<ClusterClusterPair> next = cursor.Next;
                        ClusterClusterPair pair = cursor.Value;
                        ICollection<User> pair1 = pair.Cluster1;
                        ICollection<User> pair2 = pair.Cluster2;
                        if (pair1 == cluster1 || pair1 == cluster2 || pair2 == cluster1 || pair2 == cluster2)
                        {
                            queue.Remove(cursor);
                        }
                        cursor = next;
                    }

                    // Make new merged cluster
                    HashedSet<User> merged = new HashedSet<User>(/*cluster1.Count + cluster2.Count*/);
                    merged.AddAll(cluster1);
                    merged.AddAll(cluster2);

                    // Compare against other clusters; update queue if needed
                    // That new pair we're just adding might be pretty close to something else, so
                    // catch that case here and put it back into our queue
                    foreach (ICollection<User> cluster in clusters)
                    {
                        double similarity = clusterSimilarity.GetSimilarity(merged, cluster);

                        // With an empty queue there is no "worst queued pair" to beat, so nothing
                        // is re-queued; the next outer pass rescans all pairs anyway.
                        if (queue.Count > 0 && similarity > queue.Last.Value.Similarity)
                        {
                            InsertByDescendingSimilarity(queue, merged, cluster, similarity);
                        }
                    }

                    // Finally add new cluster to list
                    clusters.AddLast(merged);
                }
            }

            topRecsByUserID = ComputeTopRecsPerUserID(clusters);
            clustersByUserID = ComputeClustersPerUserID(clusters);
        }
        clustersBuilt = true;
    }
    finally
    {
        buildClustersLock.Unlock();
    }
}

/// <summary>
/// Inserts a new pair into <paramref name="queue"/> so that similarities stay in
/// descending order; a pair that ties with existing entries is placed after them.
/// Works on an empty queue and can place the new pair at the head.
/// </summary>
private static void InsertByDescendingSimilarity(
    LinkedList<ClusterClusterPair> queue,
    ICollection<User> first,
    ICollection<User> second,
    double similarity)
{
    ClusterClusterPair pair = new ClusterClusterPair(first, second, similarity);

    // Walk back from the tail past every entry that is strictly less similar.
    LinkedListNode<ClusterClusterPair> node = queue.Last;
    while (node != null && similarity > node.Value.Similarity)
    {
        node = node.Previous;
    }

    if (node == null)
    {
        queue.AddFirst(pair);
    }
    else
    {
        queue.AddAfter(node, pair);
    }
}