Beispiel #1
0
        /// <summary>
        /// Reduces an initial set of clusters by repeatedly merging the most similar pair
        /// until no more than <c>GetK</c> clusters remain.
        /// </summary>
        /// <remarks>
        /// The loop is a do/while, so one merge pass is attempted even when the initial set
        /// is already at or below <c>GetK</c> (provided at least two clusters exist).
        /// </remarks>
        /// <param name="docIds">Ids of the documents used to build the initial clusters and the term matrix.</param>
        /// <returns>The clusters that remain after merging.</returns>
        private Cluster[] GetCentroids(short[] docIds)
        {
            Cluster[]     clusters = GetInitialClusters(docIds, getSampleSize(docIds.Length));
            DocTermMatrix matrix   = new DocTermMatrix(docIds, GetIndex);

            do
            {
                // Without two distinct clusters there is no pair to merge.
                if (clusters.Length < 2)
                {
                    break;
                }

                // Default to the first pair so a distinct pair is always merged, even when
                // every score is NaN. Starting from negative infinity lets pairs with zero
                // or negative similarity be picked as well; previously such input left both
                // indices at 0 and merged cluster 0 with itself.
                int   clusterToMerge1 = 0;
                int   clusterToMerge2 = 1;
                float bestSim         = float.NegativeInfinity;

                // Each unordered pair is scored once. This assumes
                // getSimilarity(a, b) == getSimilarity(b, a); under that assumption the
                // pair chosen is the same one a full (i, j), i != j scan would pick first.
                for (int i = 0; i < clusters.Length - 1; i++)
                {
                    for (int j = i + 1; j < clusters.Length; j++)
                    {
                        float pairSim = getSimilarity(clusters[i], clusters[j], matrix);
                        if (pairSim > bestSim)
                        {
                            clusterToMerge1 = i;
                            clusterToMerge2 = j;
                            bestSim         = pairSim;
                        }
                    }
                }

                clusters = mergeClusters(clusterToMerge1, clusterToMerge2, clusters);
            }while (clusters.Length > GetK);

            return(clusters);
        }
Beispiel #2
0
        /// <summary>
        /// Scores how similar two clusters are: the product of their centroid values divided
        /// by the product of their norms, all obtained from <paramref name="matrix"/>.
        /// </summary>
        /// <param name="c1">First cluster.</param>
        /// <param name="c2">Second cluster.</param>
        /// <param name="matrix">Matrix that supplies the centroid and norm of each cluster.</param>
        /// <returns>
        /// The similarity score, or 0 when the product of the norms is 0 (the score is
        /// undefined there and would otherwise come out as NaN or an infinity).
        /// </returns>
        private float getSimilarity(Cluster c1, Cluster c2, DocTermMatrix matrix)
        {
            float centroid1 = matrix.GetCentroid(c1);
            float centroid2 = matrix.GetCentroid(c2);
            float norm1     = matrix.GetNorm(c1);
            float norm2     = matrix.GetNorm(c2);

            float denominator = norm1 * norm2;

            // Floating-point division by zero does not throw; it yields NaN or +/-Infinity,
            // and an infinite score would win every "most similar" comparison in callers.
            if (denominator == 0)
            {
                return(0);
            }

            return((centroid1 * centroid2) / denominator);
        }
Beispiel #3
0
        /// <summary>
        /// Partitions the given documents into at most <c>GetK</c> clusters by iterative
        /// reassignment: on every pass each document is placed in the cluster of the previous
        /// pass with which it has the highest average similarity.
        /// </summary>
        /// <remarks>
        /// Iteration stops when <c>areConverged</c> reports that two consecutive passes agree,
        /// or after ten passes. Documents whose similarity to every cluster is not above 0
        /// are left unassigned for that pass.
        /// </remarks>
        /// <param name="docIds">Ids of the documents to cluster.</param>
        /// <param name="commonTerms">Passed through to <c>LoadCommonTerms</c> when <c>getTerms</c> is set.</param>
        /// <returns>The non-empty clusters, with common terms loaded when <c>getTerms</c> is set.</returns>
        public override Cluster[] CalculateClusters(short[] docIds, int commonTerms)
        {
            const int maxIterations = 10;

            DocTermMatrix matrix = new DocTermMatrix(docIds, GetIndex);

            // Start from the stored centroids when present, otherwise from fresh initial clusters.
            Cluster[] newClusters = (centroids == null)
                ? GetInitialClusters(docIds, GetK)
                : centroids;

            Cluster[] prevClusters;
            int       iterations = 0;

            do
            {
                prevClusters = newClusters;            // previous pass acts as the reference clusters
                newClusters  = GetEmptyClusters(GetK); // rebuilt from scratch on every pass

                foreach (short docId in docIds)
                {
                    float bestSim     = 0;
                    int   bestCluster = -1;

                    for (int j = 0; j < GetK; j++) // compare against each reference cluster
                    {
                        float candidateSim = getAvgSimilarity(docId, prevClusters[j], matrix);
                        if (candidateSim > bestSim)
                        {
                            bestSim     = candidateSim;
                            bestCluster = j; // remember the cluster index itself, not how often the best score improved
                        }
                    }

                    if (bestCluster > -1) // assign ONLY if similarity to some cluster is > 0
                    {
                        newClusters[bestCluster].AddDoc(docId);
                    }
                }

                iterations++; // count the completed pass before checking the limit, so at most maxIterations passes run
            } while (iterations < maxIterations && !areConverged(prevClusters, newClusters));

            Cluster[] result = removeEmptyClusters(newClusters);
            if (getTerms)
            {
                return(LoadCommonTerms(result, commonTerms));
            }

            return(result);
        }
Beispiel #4
0
        /// <summary>
        /// Computes the average similarity between one document and the documents of a cluster.
        /// </summary>
        /// <param name="docId">Id of the document being compared.</param>
        /// <param name="c">Cluster whose <c>DocIds</c> keys are the ids to compare against.</param>
        /// <param name="matrix">Matrix that supplies the pairwise document similarity.</param>
        /// <returns>
        /// The summed pairwise similarity divided by <c>c.DocumentCount</c>, or 0 when the
        /// cluster reports no documents (the division would otherwise produce NaN).
        /// </returns>
        private float getAvgSimilarity(short docId, Cluster c, DocTermMatrix matrix)
        {
            // An empty cluster has no average; callers only act on scores above 0,
            // so 0 means "no similarity" to them.
            if (c.DocumentCount == 0)
            {
                return(0);
            }

            Hashtable docIds = c.DocIds;
            float     total  = 0;

            // The hashtable keys hold the member document ids.
            foreach (object memberId in docIds.Keys)
            {
                total += matrix.GetSimilarity(docId, Convert.ToInt16(memberId));
            }

            return(total / c.DocumentCount);
        }
Beispiel #5
0
 /// <summary>
 /// Creates an analyzer over the given index and document ids and builds the
 /// document-term matrix for those documents.
 /// </summary>
 /// <param name="index">Index stored on the analyzer and handed to the matrix.</param>
 /// <param name="allDocIds">Document ids stored on the analyzer and handed to the matrix.</param>
 public ClusteringAnalyzer(Index index, short[] allDocIds)
 {
     // The matrix is built straight from the constructor arguments.
     matrix = new DocTermMatrix(allDocIds, index);

     // Keep the inputs for later use by the analyzer.
     this.allDocIds = allDocIds;
     this.index     = index;
 }