/// <summary>
/// Builds seed clusters by agglomerative merging: starts from an initial sample of
/// clusters and repeatedly merges the most similar pair until no more than
/// <c>GetK</c> clusters remain.
/// </summary>
/// <param name="docIds">Ids of the documents to sample the initial clusters from.</param>
/// <returns>The merged clusters (at most <c>GetK</c> unless fewer than two remain).</returns>
private Cluster[] GetCentroids(short[] docIds)
{
    Cluster[] clusters = GetInitialClusters(docIds, getSampleSize(docIds.Length));
    DocTermMatrix matrix = new DocTermMatrix(docIds, GetIndex);

    // A pre-checked loop: never merge when the sample already holds GetK or fewer
    // clusters, and never attempt a merge when there is no second cluster to merge with.
    while (clusters.Length > GetK && clusters.Length > 1)
    {
        // Default to the first two clusters so that, when no pair has a positive
        // similarity, two distinct clusters are merged rather than cluster 0 with itself.
        int clusterToMerge1 = 0;
        int clusterToMerge2 = 1;
        float sim = 0;

        for (int i = 0; i < clusters.Length; i++)
        {
            // getSimilarity is symmetric in its two clusters, so each unordered pair
            // only needs to be evaluated once (j > i). With the strict '>' comparison
            // this picks the same pair as scanning all ordered pairs.
            for (int j = i + 1; j < clusters.Length; j++)
            {
                float newSim = getSimilarity(clusters[i], clusters[j], matrix);
                if (newSim > sim)
                {
                    clusterToMerge1 = i;
                    clusterToMerge2 = j;
                    sim = newSim;
                }
            }
        }

        clusters = mergeClusters(clusterToMerge1, clusterToMerge2, clusters);
    }

    return clusters;
}
/// <summary>
/// Scores how alike two clusters are: the product of their centroid values divided
/// by the product of their norms, all taken from the supplied matrix.
/// </summary>
/// <param name="c1">First cluster.</param>
/// <param name="c2">Second cluster.</param>
/// <param name="matrix">Document-term matrix that supplies centroid and norm values.</param>
/// <returns>The centroid product divided by the norm product.</returns>
private float getSimilarity(Cluster c1, Cluster c2, DocTermMatrix matrix)
{
    // Centroid values first, then norms - same lookup order for both clusters.
    float firstCentroid = matrix.GetCentroid(c1);
    float secondCentroid = matrix.GetCentroid(c2);
    float firstNorm = matrix.GetNorm(c1);
    float secondNorm = matrix.GetNorm(c2);

    return firstCentroid * secondCentroid / (firstNorm * secondNorm);
}
/// <summary>
/// Clusters the given documents by iterative reassignment: every document is placed
/// in the cluster (from the previous pass) it is most similar to on average, and the
/// passes repeat until the clusters converge or the iteration cap is reached.
/// </summary>
/// <param name="docIds">Ids of the documents to cluster.</param>
/// <param name="commonTerms">Passed to <c>LoadCommonTerms</c> when <c>getTerms</c> is set.</param>
/// <returns>
/// The non-empty clusters of the final pass, run through <c>LoadCommonTerms</c>
/// when <c>getTerms</c> is set.
/// </returns>
public override Cluster[] CalculateClusters(short[] docIds, int commonTerms)
{
    const int maxIterations = 10;

    DocTermMatrix matrix = new DocTermMatrix(docIds, GetIndex);

    // Seed with precomputed centroids when available, otherwise with initial clusters.
    Cluster[] prevClusters;
    Cluster[] newClusters;
    if (centroids == null)
    {
        newClusters = GetInitialClusters(docIds, GetK);
    }
    else
    {
        newClusters = centroids;
    }

    int iterations = 0;
    do
    {
        prevClusters = newClusters;           // save previous clusters
        newClusters = GetEmptyClusters(GetK); // empty new clusters and re-populate, use prevClusters as means

        // Never index past the end of the previous clusters if fewer than GetK were supplied.
        int candidateCount = prevClusters.Length < GetK ? prevClusters.Length : GetK;

        for (int i = 0; i < docIds.Length; i++) // for each doc
        {
            short docId = docIds[i];
            float sim = 0;
            int assignedCluster = -1;

            for (int j = 0; j < candidateCount; j++) // calculate similarity with each cluster
            {
                float newSim = getAvgSimilarity(docId, prevClusters[j], matrix);
                if (newSim > sim)
                {
                    // New similarity is higher: remember it and the index of THIS cluster.
                    // (Previously the index was incremented instead of set to j, which
                    // assigned documents to the wrong cluster.)
                    sim = newSim;
                    assignedCluster = j;
                }
            }

            if (assignedCluster > -1) // assign to cluster ONLY if sim to any cluster is > 0
            {
                newClusters[assignedCluster].AddDoc(docId);
            }
        }
    }
    // The counter is compared before it is incremented, so up to maxIterations + 1 passes can run.
    while (iterations++ < maxIterations && !areConverged(prevClusters, newClusters));

    Cluster[] populated = removeEmptyClusters(newClusters);
    if (getTerms)
    {
        return LoadCommonTerms(populated, commonTerms);
    }

    return populated;
}
/// <summary>
/// Computes the average similarity between one document and all documents of a cluster.
/// </summary>
/// <param name="docId">Id of the document being compared.</param>
/// <param name="c">Cluster whose member documents are compared against.</param>
/// <param name="matrix">Document-term matrix that supplies pairwise document similarity.</param>
/// <returns>
/// The sum of pairwise similarities divided by the cluster's document count,
/// or 0 when the cluster has no documents.
/// </returns>
private float getAvgSimilarity(short docId, Cluster c, DocTermMatrix matrix)
{
    // An empty cluster would otherwise produce 0 / 0 (NaN); report "no similarity" explicitly.
    if (c.DocumentCount == 0)
    {
        return 0;
    }

    float sim = 0;
    Hashtable docIds = c.DocIds;

    // The hashtable keys are the member document ids.
    foreach (object memberId in docIds.Keys)
    {
        sim += matrix.GetSimilarity(docId, Convert.ToInt16(memberId));
    }

    return sim / c.DocumentCount;
}
/// <summary>
/// Creates an analyzer for the given index and document ids and builds the
/// document-term matrix for those documents.
/// </summary>
/// <param name="index">Index stored on the analyzer and used to build the matrix.</param>
/// <param name="allDocIds">Ids of all documents the matrix is built over.</param>
public ClusteringAnalyzer(Index index, short[] allDocIds)
{
    // Keep references to the inputs.
    this.allDocIds = allDocIds;
    this.index = index;

    // Build the matrix once, from the same inputs.
    matrix = new DocTermMatrix(allDocIds, index);
}