/// <summary>
/// Build a merge candidate for the two clusters identified by label1 and label2.
/// Records one point from each cluster, the radius of the union of both clusters' points,
/// and the shrinkage of that combined radius relative to the two supplied radii.
/// </summary>
/// <param name="clusters">Classification that contains both clusters.</param>
/// <param name="label1">Label of the first cluster.</param>
/// <param name="radius1">Radius passed to CombinedRadius.Shrinkage for the first cluster.</param>
/// <param name="label2">Label of the second cluster.</param>
/// <param name="radius2">Radius passed to CombinedRadius.Shrinkage for the second cluster.</param>
public RadiusMergeCandidate(
     Classification <UnsignedPoint, string> clusters,
     string label1,
     ClusterRadius radius1,
     string label2,
     ClusterRadius radius2
     )
 {
     // First cluster: remember its label and take whichever member iterates first.
     Label1 = label1;
     var firstMembers = clusters.PointsInClass(Label1);
     Point1 = firstMembers.First();

     // Second cluster: same treatment.
     Label2 = label2;
     var secondMembers = clusters.PointsInClass(Label2);
     Point2 = secondMembers.First();

     // Radius over the points of both clusters, then its shrinkage versus the individual radii.
     CombinedRadius = new ClusterRadius(firstMembers, secondMembers);
     Shrinkage      = CombinedRadius.Shrinkage(radius1, radius2);
 }
Ejemplo n.º 2
0
 /// <summary>
 /// Supply the data to be processed directly in memory, instead of reading it from a file.
 /// Sets InitialClassification, InputOrder and InputDataIds.
 /// </summary>
 /// <param name="initialClassification">Classification to store as InitialClassification.</param>
 /// <param name="originalOrder">Ordering of the points to record in InputOrder. If null, the order in which
 /// InitialClassification.Points() iterates is used instead.</param>
 public void LoadData(Classification <UnsignedPoint, string> initialClassification, IList <UnsignedPoint> originalOrder = null)
 {
     InitialClassification = initialClassification;

     // Either way a fresh list is created, so InputOrder never aliases the caller's list.
     InputOrder = originalOrder == null
         ? InitialClassification.Points().ToList()
         : originalOrder.ToList();

     // Each point's external id is the string form of its UniqueId.
     InputDataIds = new Dictionary <UnsignedPoint, string>();
     foreach (var classifiedPoint in InitialClassification.Points())
     {
         var idText = classifiedPoint.UniqueId.ToString();
         InputDataIds[classifiedPoint] = idText;
     }
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Build an index over every point in a Classification. Each point is passed through
        /// HilbertPoint.CastOrConvert and collected in UnsortedPoints, then InitIndexing is called.
        /// Optionally, each point first receives one extra coordinate holding the number of its cluster.
        /// </summary>
        /// <param name="clusters">Clusters of points, which could be UnsignedPoints or HilbertPoints.</param>
        /// <param name="bitsPerDimension">Bits per dimension handed to HilbertPoint.CastOrConvert.
        /// If zero or negative, the value is obtained from FindBitsPerDimension applied to all points in clusters.</param>
        /// <param name="addClassificationDimension">If <c>true</c>, append a coordinate to each point whose value is the
        /// zero-based position of the point's cluster within clusters.LabelToPoints.Values. That position depends
        /// solely on the iteration order of that collection.</param>
        public HilbertIndex(Classification <UnsignedPoint, string> clusters, int bitsPerDimension = 0, bool addClassificationDimension = false)
        {
            if (bitsPerDimension <= 0)
            {
                bitsPerDimension = FindBitsPerDimension(clusters.Points());
            }

            UnsortedPoints = new List <HilbertPoint>();

            // Clusters are numbered 0, 1, 2, ... in the order LabelToPoints.Values yields them.
            var clusterPosition = 0;
            foreach (var cluster in clusters.LabelToPoints.Values)
            {
                var classIndex = (uint)clusterPosition;
                clusterPosition++;

                foreach (var member in cluster)
                {
                    var source = addClassificationDimension ? member.AppendCoordinate(classIndex) : member;
                    UnsortedPoints.Add(HilbertPoint.CastOrConvert(source, bitsPerDimension, true));
                }
            }
            InitIndexing();
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Counts the clusters of alternatePartition whose set hash equals the set hash of some cluster
        /// in this Classification.
        ///
        /// Clusters are compared through SetHash applied to the hash codes of their points, not
        /// point by point, so a match means "same hash".
        ///
        /// This is a much cheaper measure than BCubed but a far coarser one. It is best suited to deciding
        /// whether two Classifications are exactly the same. For example, with 10,000 points in 100 clusters
        /// where every cluster is missing a single point, no cluster matches and the result is zero, even
        /// though BCubed would score that very well. Conversely, a result equal to the number of clusters in
        /// this Classification indicates identical partitions, for which BCubed would equal one.
        /// </summary>
        /// <returns>Count of clusters in alternatePartition that match a cluster of this Classification.</returns>
        /// <param name="alternatePartition">Alternate partition to compare.</param>
        public int IdenticalClusters(Classification <TPoint, TLabel> alternatePartition)
        {
            // Pass 1: remember the hash of every cluster in "this" Classification.
            var knownHashes = new HashSet <int>();
            foreach (var members in LabelToPoints.Values)
            {
                knownHashes.Add(SetHash(members.Select(point => point.GetHashCode())));
            }

            // Pass 2: count the alternate clusters whose hash was seen in pass 1.
            var matches = alternatePartition.LabelToPoints.Values.Count(
                members => knownHashes.Contains(SetHash(members.Select(point => point.GetHashCode())))
                );
            return matches;
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Counts the distinct goldStandard labels carried by the points of one class of this Classification.
 /// </summary>
 /// <param name="goldStandard">A second Classification used as the benchmark.
 /// The two Classifications need not share a labeling scheme.</param>
 /// <param name="classLabel">Label of a class in this Classification (NOT in the goldStandard).</param>
 /// <returns>The number of different goldStandard labels found among the points of the class.
 /// One means every point of the class falls in a single goldStandard class, which is what a
 /// perfect clustering always yields.</returns>
 public int Homogeneity(Classification <TPoint, TLabel> goldStandard, TLabel classLabel)
 {
     var members    = LabelToPoints[classLabel];
     var goldLabels = members.Select(point => goldStandard.GetClassLabel(point));
     return goldLabels.Distinct().Count();
 }
Ejemplo n.º 6
0
        /// <summary>
        /// Load the data from a file or standard input unless it has already been loaded (IsDataLoaded is true).
        /// Populates InitialClassification, InputOrder and InputDataIds.
        ///
        /// It must have a header row if Configuration.Data.ReadHeader is true.
        /// Field values may be separated by commas or tabs.
        /// Lines that split into fewer than two fields are skipped and do not consume a row number.
        ///
        /// With a header row, the Id and Category columns are located by comparing each heading to
        /// Configuration.Data.IdField and Configuration.Data.CategoryField, ignoring case. Without one, those two
        /// settings are parsed as column positions. When no Id column is identified the one-based row number
        /// is the id; when no Category column is identified each point is given its row number as category.
        ///
        /// The "Load data" timer is stopped even if reading or parsing throws.
        /// </summary>
        void LoadData()
        {
            if (IsDataLoaded)
            {
                return;
            }
            Timer.Start("Load data");
            try
            {
                InitialClassification = new Classification <UnsignedPoint, string>();
                InputOrder            = new List <UnsignedPoint>();
                InputDataIds          = new Dictionary <UnsignedPoint, string>();
                IEnumerable <string> lines;

                if (Configuration.Data.ReadFromStandardIn())
                {
                    lines = ReadLinesFromConsole();
                }
                else
                {
                    lines = File.ReadLines(Configuration.Data.InputDataFile);
                }

                var readHeader       = Configuration.Data.ReadHeader;
                var idPosition       = -1;
                var categoryPosition = -1;

                if (!readHeader)
                {
                    // No header: the field settings are interpreted as column positions (-1 if not numeric).
                    idPosition       = SafeParseInt(Configuration.Data.IdField, -1);
                    categoryPosition = SafeParseInt(Configuration.Data.CategoryField, -1);
                }

                // Regardless of whether the file has a header row, row number one
                // is the first row with data, not column headings.
                var rownum = readHeader ? 0 : 1;

                // Allocated once rather than once per input line.
                var delimiters = new [] { ',', '\t' };
                foreach (var values in lines.Select(line => line.Split(delimiters)))
                {
                    // Skip blank lines and comments
                    if (values.Length < 2)
                    {
                        continue;
                    }
                    if (rownum == 0 && readHeader)
                    {
                        // Identify which columns hold the Id and the Category, if any.
                        // If no column holds the Id, then the one-based row number will be used.
                        // The static string.Equals tolerates a null field name (it simply never matches)
                        // and avoids building upper-cased copies of every heading.
                        var tryIdPosition = Array.FindIndex(
                            values,
                            heading => string.Equals(heading, Configuration.Data.IdField, StringComparison.OrdinalIgnoreCase)
                            );
                        if (tryIdPosition != -1)
                        {
                            idPosition = tryIdPosition;
                        }
                        var tryCategoryPosition = Array.FindIndex(
                            values,
                            heading => string.Equals(heading, Configuration.Data.CategoryField, StringComparison.OrdinalIgnoreCase)
                            );
                        if (tryCategoryPosition != -1)
                        {
                            categoryPosition = tryCategoryPosition;
                        }
                    }
                    else
                    {
                        int    id;
                        string idString;
                        if (idPosition == -1)
                        {
                            id       = rownum;
                            idString = rownum.ToString();
                        }
                        else
                        {
                            // If the id is not a number, we use the rownum in the points we create as the id, but
                            // make a correspondence between the string and the point.
                            id       = SafeParseInt(values[idPosition], rownum);
                            idString = values[idPosition];
                        }
                        string categoryString;
                        if (categoryPosition == -1)
                        {
                            categoryString = rownum.ToString();                         // Unclassified - all points in their own cluster.
                        }
                        else
                        {
                            categoryString = values[categoryPosition];
                        }

                        // Every field other than the Id and Category columns is a coordinate;
                        // fields that fail to parse become zero.
                        var coordinates = new List <uint>();
                        foreach (var pair in values.Select((v, i) => new { Value = v, Position = i }))
                        {
                            //TODO: Reject negative values and log.
                            // NOTE(review): a negative parse result is not rejected here and will not
                            // survive the cast to uint unchanged.
                            if (pair.Position != idPosition && pair.Position != categoryPosition)
                            {
                                coordinates.Add((uint)SafeParseInt(pair.Value, 0));
                            }
                        }
                        var point = new UnsignedPoint(coordinates, id);
                        //TODO: Check for duplicate ids and log.
                        InputDataIds[point] = idString;
                        InitialClassification.Add(point, categoryString);
                        InputOrder.Add(point);
                    }
                    rownum++;
                }
            }
            finally
            {
                // Stop the timer on every exit path so a failed load does not leave it running.
                Timer.Stop("Load data");
            }
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Run density-based reclassification over the clusters of FinalClassification, unless the
        /// configuration says to skip it.
        /// A cluster that the DensityClassifier divides into more than one partition is split: its points
        /// are moved to new classes labeled "{old label}-{density label}".
        /// Existing clusters are never merged by this step.
        /// </summary>
        void ReclassifyByDensity()
        {
            // 0. Honor the configuration switch that disables this step.
            if (Configuration.DensityClassifier.SkipDensityClassification)
            {
                return;
            }

            Timer.Start("Reclassify by density");
            var numberOfClustersSplit = 0;

            // 1. Visit every cluster. FinalClassification is modified inside the loop,
            //    so iterate over a snapshot of its labels.
            foreach (var clusterId in FinalClassification.ClassLabels().ToList())
            {
                // 2. Leave alone any cluster that does not need reclustering.
                if (!NeedsReclustering(clusterId))
                {
                    continue;
                }

                // 3. Gather the cluster's members, keyed by UniqueId, and build a Hilbert index over them.
                //    Each member starts in its own class, labeled "1", "2", "3", ...
                var members    = FinalClassification.PointsInClass(clusterId);
                var memberById = new Dictionary <int, UnsignedPoint>();
                foreach (var member in members)
                {
                    memberById[member.UniqueId] = member;
                }
                var labelsIssued = 0;
                var singletons   = new Classification <UnsignedPoint, string>(members, p => (++labelsIssued).ToString());
                var memberIndex  = new HilbertIndex(singletons, Configuration.Index.BitsPerDimension);

                // 4. Configure the DensityClassifier.
                var unmergeableSize = (int)(members.Count * Configuration.DensityClassifier.UnmergeableSizeFraction);
                var classifier      = new DensityClassifier(memberIndex, MergeSquareDistance, unmergeableSize);
                classifier.NeighborhoodRadiusMultiplier = Configuration.DensityClassifier.NeighborhoodRadiusMultiplier;
                classifier.OutlierSize                  = Configuration.DensityClassifier.OutlierSize;
                classifier.MergeableShrinkage           = Configuration.DensityClassifier.MergeableShrinkage;

                // 5. Reclassify. The result is expressed in HilbertPoints, which are mapped
                //    back to the original UnsignedPoints below.
                var densityClassification = classifier.Classify();

                // 6. A single partition means the cluster stays as it is.
                if (densityClassification.NumPartitions <= 1)
                {
                    continue;
                }
                numberOfClustersSplit++;

                // 7. Translate each HilbertPoint back to its UnsignedPoint via the shared UniqueId.
                foreach (var hilbertPoint in densityClassification.Points())
                {
                    var original = memberById[hilbertPoint.UniqueId];

                    // The new label is the previous label joined to the density-based label.
                    var previousClassLabel = FinalClassification.GetClassLabel(original);
                    var densityClassLabel  = densityClassification.GetClassLabel(hilbertPoint);
                    var relabeled          = $"{previousClassLabel}-{densityClassLabel}";

                    // 8. Move the point out of its current cluster and into the new one.
                    FinalClassification.Remove(original);
                    FinalClassification.Add(original, relabeled);
                }
            }

            Timer.Stop("Reclassify by density");
            Logger.Info($"Clusters split due to density-based reclassification: {numberOfClustersSplit}");
        }
 /// <summary>
 /// Creates a ClosestCluster that works over the given classification of points.
 /// </summary>
 /// <param name="clusters">Classification of the points into distinct clusters; stored in Clusters as-is (not copied).</param>
 public ClosestCluster(Classification <UnsignedPoint, TLabel> clusters)
 {
     this.Clusters = clusters;
 }
 /// <summary>
 /// Refresh Color1 and Color2 so they hold the labels that the given classification currently
 /// assigns to Point1 and Point2. Needed because categories may have been merged since this
 /// pair was created.
 /// </summary>
 /// <param name="clusters">Current clustering of points.</param>
 /// <returns>This same instance, after its colors have been updated.</returns>
 public ClosestPair Relabel(Classification <UnsignedPoint, TLabel> clusters)
 {
     this.Color1 = clusters.GetClassLabel(this.Point1);
     this.Color2 = clusters.GetClassLabel(this.Point2);
     return this;
 }