Пример #1
0
        /// <summary>
        /// Clusters the documents of a job around <paramref name="k"/> centers and scores the
        /// result with the silhouette coefficient.
        /// </summary>
        /// <remarks>
        /// Steps, in order: load the job's matrix container, seed the centers from randomly chosen
        /// distinct columns of the V matrix, alternate <c>CalcDistancesAndAssign</c> and
        /// <c>MoveCenters</c> until <c>IsOptimized</c> is set or the iteration cap is reached,
        /// then compute per-document, per-cluster and global silhouette values.
        /// See https://en.wikipedia.org/wiki/Silhouette_(clustering).
        /// </remarks>
        /// <param name="generator">Random source used to pick the initial center documents.</param>
        /// <param name="jobId">Identifier of the job whose matrices are loaded.</param>
        /// <param name="k">Number of clusters to build.</param>
        /// <param name="maxIteration">Upper bound on assign/move iterations.</param>
        /// <exception cref="ArgumentNullException"><paramref name="generator"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="k"/> is less than one.</exception>
        /// <exception cref="ArgumentException">The job has fewer documents than requested clusters.</exception>
        public Cluster(Random generator, int jobId, int k = 2, int maxIteration = 200)
        {
            if (generator == null)
            {
                throw new ArgumentNullException(nameof(generator));
            }

            if (k < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(k), k, "At least one cluster is required.");
            }

            Clusters      = k;
            MaxIterations = maxIteration;
            JobId         = jobId;

            LSA.GetMatrixContainer(JobId);

            var documentCount = LSA.MatrixContainer.VMatrix.ColumnCount;

            // Without this guard the selection loop below could never collect enough
            // distinct indexes and would spin forever.
            if (documentCount < Clusters)
            {
                throw new ArgumentException(
                    $"Cannot pick {Clusters} distinct cluster centers from {documentCount} documents.",
                    nameof(k));
            }

            // Pick exactly `Clusters` distinct documents as the initial centers.
            var centerDocIndexes = new HashSet<int>();

            while (centerDocIndexes.Count < Clusters)
            {
                centerDocIndexes.Add(generator.Next(documentCount));
            }

            var centerSlot = 0;

            foreach (var docIndex in centerDocIndexes)
            {
                Centers[centerSlot++] = LSA.MatrixContainer.VMatrix.Column(docIndex).ToArray();
            }

            var clusterTimer = Stopwatch.StartNew();
            var iterations   = 0;

            // Runs at most MaxIterations assign/move rounds.
            while (!IsOptimized && iterations < MaxIterations)
            {
                iterations++;

                CalcDistancesAndAssign();

                MoveCenters();
            }

            Debug.WriteLine($"----{Clusters}----Total Iterations: {iterations}");

            // Membership of every cluster, computed once. The silhouette pass below needs it
            // for every document, so rebuilding it inside the parallel loop is wasted work.
            var clusterMembers = Enumerable.Range(0, Clusters)
                                           .Select(cluster => ClusterMap.Where(c => c.Value == cluster)
                                                                        .Select(c => c.Key)
                                                                        .ToArray())
                                           .ToArray();

            for (var i = 0; i < Clusters; i++)
            {
                DocsPerCluster[i] = clusterMembers[i].Length;
            }

            Debug.WriteLine($"----{Clusters}----Total Cluster Calc: {clusterTimer.Elapsed.TotalMilliseconds} Milliseconds");

            var silhouetteTimer = Stopwatch.StartNew();

            // Silhouette (clustering)
            // https://en.wikipedia.org/wiki/Silhouette_(clustering)

            // NOTE(review): ClusterSi.Add is invoked from parallel code; confirm ClusterSi is a
            // thread-safe collection (its declaration is not visible here).
            ClusterMap.AsParallel().ForAll(kvp =>
            {
                var currentCluster = kvp.Value;

                // a(i): mean distance from this document to the members of its own cluster.
                // NOTE(review): the divisor is the full cluster size (self included, filtered
                // distances included); the reference definition divides by |C| - 1. Left as is
                // to keep existing scores comparable - confirm whether this is intended.
                // Accumulate in double and narrow once to limit rounding drift.
                double ownDistanceSum = 0;

                foreach (var memberKey in clusterMembers[currentCluster])
                {
                    var distance = LSA.MatrixContainer.DistanceMap[kvp.Key, memberKey];

                    if (!float.IsInfinity(distance) && !float.IsNaN(distance))
                    {
                        ownDistanceSum += distance;
                    }
                }

                var aI = (float) ownDistanceSum / DocsPerCluster[currentCluster];

                // b(i): smallest mean distance from this document to any other cluster.
                var bI          = float.PositiveInfinity;
                var hasNeighbor = false;

                for (var other = 0; other < Clusters; other++)
                {
                    if (other == currentCluster)
                    {
                        continue;
                    }

                    double otherDistanceSum = 0;

                    foreach (var memberKey in clusterMembers[other])
                    {
                        var distanceOther = LSA.MatrixContainer.DistanceMap[kvp.Key, memberKey];

                        if (!float.IsInfinity(distanceOther) && !float.IsNaN(distanceOther))
                        {
                            otherDistanceSum += distanceOther;
                        }
                    }

                    // An empty cluster yields 0 / 0 = NaN here and is skipped.
                    var otherDistanceAverage = (float) otherDistanceSum / DocsPerCluster[other];

                    if (!float.IsInfinity(otherDistanceAverage) && !float.IsNaN(otherDistanceAverage))
                    {
                        bI          = Math.Min(bI, otherDistanceAverage);
                        hasNeighbor = true;
                    }
                }

                // No comparable neighbor cluster (k == 1, or every other cluster is empty):
                // the silhouette is undefined for this document, so record nothing instead of
                // failing on the minimum of an empty sequence.
                if (!hasNeighbor)
                {
                    return;
                }

                var sI = (bI - aI) / Math.Max(aI, bI);

                if (!float.IsInfinity(sI) && !float.IsNaN(sI))
                {
                    DocumentSi.AddOrUpdate(kvp.Key, sI, (key, existing) => sI);
                    ClusterSi.Add(Tuple.Create(currentCluster, sI));
                }
            });

            // Calc Cluster Si Averages (clusters without any recorded score get no entry)
            for (var m = 0; m < Clusters; m++)
            {
                var clusterScores = ClusterSi.Where(c => c.Item1 == m)
                                             .Select(c => c.Item2)
                                             .ToList();

                if (clusterScores.Count > 0)
                {
                    ClusterSiAverages[m] = clusterScores.Average();
                }
            }

            // Calculate Si Averages; fall back to 0 when nothing was scored, because
            // Average() throws on an empty sequence.
            GlobalSi = DocumentSi.Any()
                ? DocumentSi.Average(kvp => kvp.Value)
                : 0;
            GlobalClusterSiAverage = ClusterSiAverages.Any()
                ? ClusterSiAverages.Average(kvp => kvp.Value)
                : 0;

            Debug.WriteLine($"****{Clusters}***** GlobalSi: {GlobalSi}");
            Debug.WriteLine($"****{Clusters}***** GlobalClusterSiAverage: {GlobalClusterSiAverage}");
            Debug.WriteLine($"----{Clusters}---- Total Cluster SI Calc: {silhouetteTimer.Elapsed.TotalMilliseconds} Milliseconds");
        }
Пример #2
0
        /// <summary>
        /// Persists this clustering result: updates the calculation row, inserts one cluster
        /// entity per cluster, then inserts the document assignments and the closest terms of
        /// every cluster, and finally marks the calculation as completed.
        /// </summary>
        /// <param name="context">Entity context used for all reads and writes.</param>
        /// <param name="clusterCalculationEntity">Calculation row that receives the summary values.</param>
        /// <exception cref="ArgumentNullException">Either argument is null.</exception>
        public void Save(SvdEntities context, ClusterCalculation clusterCalculationEntity)
        {
            if (context == null)
            {
                throw new ArgumentNullException(nameof(context));
            }

            if (clusterCalculationEntity == null)
            {
                throw new ArgumentNullException(nameof(clusterCalculationEntity));
            }

            // Number of nearest terms stored per cluster.
            const int termsPerCluster = 20;

            // NOTE(review): BinaryFormatter is obsolete and unsafe for untrusted data. It is kept
            // because the bytes written to CenterVectorSerialized are a storage format that
            // readers presumably depend on - migrate both sides together.
            var binaryFormatter = new BinaryFormatter();

            var jobDocs         = context.JobDocuments.Where(jd => jd.JobId == JobId).ToLookup(jd => jd.OrdinalIndex);
            var jobTerms        = context.JobTerms.Where(jt => jt.JobId == JobId).ToLookup(jt => jt.Term.Value);
            var clusterEntities = new Dictionary<int, Engine.Cluster>();

            clusterCalculationEntity.ClusterCount = Clusters;
            clusterCalculationEntity.GlobalSi     = GlobalSi;
            clusterCalculationEntity.ClusterSi    = GlobalClusterSiAverage;

            // Update Cluster Calculation
            context.SaveChanges();

            for (var cluster = 0; cluster < Clusters; cluster++)
            {
                using (var centerStream = new MemoryStream())
                {
                    binaryFormatter.Serialize(centerStream, Centers[cluster]);

                    clusterEntities.Add(cluster, new Engine.Cluster
                    {
                        JobId = JobId,
                        ClusterCalculationId = clusterCalculationEntity.Id,
                        // A cluster without any scored document has no average entry; store 0,
                        // matching the fallback used for per-document scores below.
                        Si = ClusterSiAverages.ContainsKey(cluster) ? ClusterSiAverages[cluster] : 0,
                        CenterVectorSerialized = centerStream.ToArray()
                    });
                }
            }

            // Insert Clusters
            // NOTE(review): the Id of each cluster entity is read below, so this relies on
            // BulkInsert writing generated keys back to the entities - confirm.
            context.BulkInsert(clusterEntities.Select(kvp => kvp.Value));

            var clusterJobDocumentEntities = new ConcurrentBag<ClusterJobDocument>();
            var clusterJobTermEntities     = new ConcurrentBag<ClusterJobTerm>();

            clusterEntities.AsParallel().ForAll(clusterEntity =>
            {
                var centerVector = Centers[clusterEntity.Key];

                foreach (var assignment in ClusterMap.Where(entry => entry.Value == clusterEntity.Key))
                {
                    var docIndex = assignment.Key;

                    // A lookup returns an empty sequence (never null) for an unknown key, so
                    // take the first match defensively and skip documents that have no row.
                    var jobDocument = jobDocs[docIndex].FirstOrDefault();

                    if (jobDocument == null)
                    {
                        continue;
                    }

                    clusterJobDocumentEntities.Add(new ClusterJobDocument
                    {
                        ClusterCalculationId = clusterCalculationEntity.Id,
                        ClusterId            = clusterEntity.Value.Id,
                        JobId         = JobId,
                        Si            = DocumentSi.ContainsKey(docIndex) ? DocumentSi[docIndex] : 0,
                        JobDocumentId = jobDocument.Id
                    });
                }

                // Cosine distance from the cluster center to every term row of the U matrix.
                var termDistanceMap = new Dictionary<string, float>();

                for (var i = 0; i < LSA.MatrixContainer.UMatrix.RowCount; i++)
                {
                    termDistanceMap[LSA.MatrixContainer.Terms[i]] = Distance.Cosine(centerVector, LSA.MatrixContainer.UMatrix.Row(i).ToArray());
                }

                // Keep only the terms with the smallest distance to the center.
                foreach (var term in termDistanceMap.OrderBy(t => t.Value).Take(termsPerCluster))
                {
                    // Same lookup semantics as above: a missing term yields an empty sequence.
                    var jobTerm = jobTerms[term.Key].FirstOrDefault();

                    if (jobTerm == null)
                    {
                        continue;
                    }

                    clusterJobTermEntities.Add(new ClusterJobTerm
                    {
                        ClusterCalculationId = clusterCalculationEntity.Id,
                        ClusterId            = clusterEntity.Value.Id,
                        JobId     = JobId,
                        JobTermId = jobTerm.Id,
                        DistanceToClusterCenter = term.Value
                    });
                }
            });

            // Insert Cluster Documents & Terms
            context.BulkInsert(clusterJobTermEntities);
            context.BulkInsert(clusterJobDocumentEntities);

            SetCalculationStatus(context, clusterCalculationEntity, Contracts.ClusterCalculationStatus.Completed);
        }