/// <summary>
/// Creates a merge candidate for two labeled clusters. Records the first point enumerated from each
/// cluster, the radius computed over the points of both clusters together, and the shrinkage
/// of that combined radius relative to the two individual radii.
/// </summary>
/// <param name="clusters">Classification that holds the points of both clusters.</param>
/// <param name="label1">Label of the first cluster.</param>
/// <param name="radius1">Radius of the first cluster.</param>
/// <param name="label2">Label of the second cluster.</param>
/// <param name="radius2">Radius of the second cluster.</param>
public RadiusMergeCandidate(
    Classification<UnsignedPoint, string> clusters,
    string label1,
    ClusterRadius radius1,
    string label2,
    ClusterRadius radius2)
{
    Label1 = label1;
    var firstClusterPoints = clusters.PointsInClass(Label1);
    Point1 = firstClusterPoints.First();

    Label2 = label2;
    var secondClusterPoints = clusters.PointsInClass(Label2);
    Point2 = secondClusterPoints.First();

    // The combined radius is measured over the union of both clusters' points.
    CombinedRadius = new ClusterRadius(firstClusterPoints, secondClusterPoints);
    Shrinkage = CombinedRadius.Shrinkage(radius1, radius2);
}
/// <summary>
/// Supply the data to be clustered directly from memory instead of reading it from a file.
/// </summary>
/// <param name="initialClassification">Classification to store as the initial classification.</param>
/// <param name="originalOrder">Original order of the points. If omitted (null), the order in which
/// the initial classification enumerates its points is used, which is arbitrary.</param>
public void LoadData(Classification<UnsignedPoint, string> initialClassification, IList<UnsignedPoint> originalOrder = null)
{
    InitialClassification = initialClassification;

    // ToList() copies in both cases, so InputOrder never aliases the caller's list.
    InputOrder = originalOrder == null
        ? InitialClassification.Points().ToList()
        : originalOrder.ToList();

    // Each point's external id is the string form of its UniqueId.
    InputDataIds = new Dictionary<UnsignedPoint, string>();
    foreach (var classifiedPoint in InitialClassification.Points())
    {
        InputDataIds[classifiedPoint] = classifiedPoint.UniqueId.ToString();
    }
}
/// <summary>
/// Build an index over every point in a Classification. Optionally each point is extended with one
/// extra dimension that holds the index of the cluster the point belongs to.
/// </summary>
/// <param name="clusters">Clusters of points, which could be UnsignedPoints or HilbertPoints.</param>
/// <param name="bitsPerDimension">Bits per dimension to use when transforming UnsignedPoints into HilbertPoints,
/// should that be necessary.
/// If a non-positive number, the value is computed from the data by FindBitsPerDimension, using the smallest
/// number capable of accommodating the largest coordinate values.</param>
/// <param name="addClassificationDimension">If set to <c>true</c>, add a classification dimension to the end of each point.
/// The value will be the index of that point's cluster. Cluster ordering is arbitrary and dependent on the order in which
/// Classification.LabelToPoints.Values iterates over the clusters.</param>
public HilbertIndex(Classification<UnsignedPoint, string> clusters, int bitsPerDimension = 0, bool addClassificationDimension = false)
{
    if (bitsPerDimension <= 0)
    {
        bitsPerDimension = FindBitsPerDimension(clusters.Points());
    }

    UnsortedPoints = new List<HilbertPoint>();

    uint clusterCounter = 0;
    foreach (var cluster in clusters.LabelToPoints.Values)
    {
        // Copy the counter so the lambda below captures this cluster's own index.
        var clusterIndex = clusterCounter;
        UnsortedPoints.AddRange(
            cluster.Select(p => HilbertPoint.CastOrConvert(
                addClassificationDimension ? p.AppendCoordinate(clusterIndex) : p,
                bitsPerDimension,
                true))
        );
        clusterCounter++;
    }

    InitIndexing();
}
/// <summary>
/// Counts the clusters of the alternate partition that match a cluster of this Classification.
///
/// Clusters are compared by a hash computed (via SetHash) from the hash codes of their points,
/// not by comparing the points themselves, so two different clusters whose hashes collide
/// are counted as a match.
///
/// This measure is not as good as BCubed for getting a true picture of how similar two Classifications are,
/// but it is much faster. It is best used to decide whether two Classifications are exactly the same or not.
///
/// For example, if there were 10,000 points in 100 clusters, and each cluster had one point missing,
/// then no clusters would match perfectly, giving a score of zero, when in fact that would yield
/// a very good BCubed score. However, if this returns a number equal to the total number of clusters in this
/// Classification, then the partitions are identical and BCubed would also equal one.
/// </summary>
/// <returns>Count of perfectly matching clusters.</returns>
/// <param name="alternatePartition">Alternate partition to compare.</param>
public int IdenticalClusters(Classification<TPoint, TLabel> alternatePartition)
{
    // Fingerprint each of "this" Classification's clusters.
    var ownFingerprints = new HashSet<int>();
    foreach (var cluster in LabelToPoints.Values)
    {
        ownFingerprints.Add(SetHash(cluster.Select(member => member.GetHashCode())));
    }

    // Count the alternate partition's clusters whose fingerprint was recorded above.
    return alternatePartition.LabelToPoints.Values.Count(
        cluster => ownFingerprints.Contains(SetHash(cluster.Select(member => member.GetHashCode())))
    );
}
/// <summary>
/// Counts how many distinct classes of the goldStandard contain points from the given class of this Classification.
/// </summary>
/// <param name="goldStandard">A second Classification to use as a benchmark for comparison.
/// It is not assumed that the two Classifications use the same labeling scheme.</param>
/// <param name="classLabel">Identifies a class in this Classification (NOT in the goldStandard).</param>
/// <returns>One if all points in the class labeled with classLabel in this Classification share a single class in goldStandard.
/// Otherwise, the number of distinct goldStandard classes that those points are drawn from.
/// A perfect clustering scheme will always return one.</returns>
public int Homogeneity(Classification<TPoint, TLabel> goldStandard, TLabel classLabel)
{
    var classMembers = LabelToPoints[classLabel];

    // Translate every member into the label it carries in the gold standard.
    var goldStandardLabels = classMembers.Select(goldStandard.GetClassLabel);

    return goldStandardLabels.Distinct().Count();
}
/// <summary>
/// Load the data from a file or standard input unless it has already been loaded into InitialClassification.
///
/// It must have a header row if Configuration.Data.ReadHeader is true.
/// Field values may be separated by commas or tabs.
///
/// With a header row, the Id and Category columns are found by a case-insensitive match of the column
/// headings against Configuration.Data.IdField and Configuration.Data.CategoryField. Without a header row,
/// those two settings are parsed as column positions instead. Every other column is parsed as a coordinate.
/// </summary>
void LoadData()
{
    if (IsDataLoaded)
    {
        return;
    }

    Timer.Start("Load data");
    try
    {
        InitialClassification = new Classification<UnsignedPoint, string>();
        InputOrder = new List<UnsignedPoint>();
        InputDataIds = new Dictionary<UnsignedPoint, string>();

        IEnumerable<string> lines;
        if (Configuration.Data.ReadFromStandardIn())
        {
            lines = ReadLinesFromConsole();
        }
        else
        {
            lines = File.ReadLines(Configuration.Data.InputDataFile);
        }

        var readHeader = Configuration.Data.ReadHeader;

        // -1 means "no such column".
        var idPosition = -1;
        var categoryPosition = -1;
        if (!readHeader)
        {
            // Without a header row the configured field names are interpreted as column positions.
            idPosition = SafeParseInt(Configuration.Data.IdField, -1);
            categoryPosition = SafeParseInt(Configuration.Data.CategoryField, -1);
        }

        // Row zero is the header row (if any); row one is always the first row holding data.
        var rownum = readHeader ? 0 : 1;

        foreach (var values in lines.Select(line => line.Split(new[] { ',', '\t' })))
        {
            // Skip lines with fewer than two fields: blank lines, and comments that contain no delimiter.
            // Skipped lines do not advance the row number.
            if (values.Length < 2)
            {
                continue;
            }

            if (rownum == 0 && readHeader)
            {
                // Identify which columns hold the Id and the Category, if any.
                // If no column holds the Id, then the one-based row number will be used.
                // Regardless of whether the file has a header row, row number one
                // is the first row with data, not column headings.
                var idField = Configuration.Data.IdField;
                var categoryField = Configuration.Data.CategoryField;

                // Ordinal, case-insensitive comparison: no temporary upper-cased strings, and an
                // unset (null) field name simply matches no heading instead of throwing.
                var tryIdPosition = Array.FindIndex(
                    values,
                    heading => string.Equals(heading, idField, StringComparison.OrdinalIgnoreCase)
                );
                if (tryIdPosition != -1)
                {
                    idPosition = tryIdPosition;
                }

                var tryCategoryPosition = Array.FindIndex(
                    values,
                    heading => string.Equals(heading, categoryField, StringComparison.OrdinalIgnoreCase)
                );
                if (tryCategoryPosition != -1)
                {
                    categoryPosition = tryCategoryPosition;
                }
            }
            else
            {
                int id;
                string idString;
                if (idPosition == -1)
                {
                    id = rownum;
                    idString = rownum.ToString();
                }
                else
                {
                    // If the id is not a number, we use the rownum in the points we create as the id, but
                    // make a correspondence between the string and the point.
                    id = SafeParseInt(values[idPosition], rownum);
                    idString = values[idPosition];
                }

                string categoryString;
                if (categoryPosition == -1)
                {
                    categoryString = rownum.ToString(); // Unclassified - all points in their own cluster.
                }
                else
                {
                    categoryString = values[categoryPosition];
                }

                // Every column that is neither the Id nor the Category is a coordinate.
                var coordinates = new List<uint>();
                for (var position = 0; position < values.Length; position++)
                {
                    //TODO: Reject negative values and log.
                    // NOTE(review): a negative parsed value is not rejected here; the cast to uint
                    // wraps or overflows depending on the checked context.
                    if (position != idPosition && position != categoryPosition)
                    {
                        coordinates.Add((uint)SafeParseInt(values[position], 0));
                    }
                }

                var point = new UnsignedPoint(coordinates, id);
                //TODO: Check for duplicate ids and log.
                InputDataIds[point] = idString;
                InitialClassification.Add(point, categoryString);
                InputOrder.Add(point);
            }
            rownum++;
        }
    }
    finally
    {
        // Keep the Start/Stop pair balanced even when reading or parsing throws.
        Timer.Stop("Load data");
    }
}
/// <summary>
/// Apply Density-based reclassification to the FinalClassification.
/// This may cause some clusters to be split into smaller clusters.
/// It will not cause any existing clusters to be merged.
///
/// Does nothing when Configuration.DensityClassifier.SkipDensityClassification is set.
/// Points of a split cluster are moved to new clusters labeled "{previous label}-{density label}".
/// </summary>
void ReclassifyByDensity()
{
    // 0. Decide if we will be doing this or not, based on the configuration.
    if (Configuration.DensityClassifier.SkipDensityClassification)
    {
        return;
    }

    var numberOfClustersSplit = 0;
    Timer.Start("Reclassify by density");
    try
    {
        // 1. Loop through all clusters in FinalClassification.
        //    We will be modifying FinalClassification while iterating over it,
        //    so we need to copy the list of labels up front.
        var classLabels = FinalClassification.ClassLabels().ToList();
        foreach (var clusterId in classLabels)
        {
            // 2. Decide if the cluster needs reclustering.
            if (!NeedsReclustering(clusterId))
            {
                continue;
            }

            // 3. Obtain the members of the cluster and index them by the Hilbert curve.
            var pointsToClassify = FinalClassification.PointsInClass(clusterId);
            var lookupPointById = new Dictionary<int, UnsignedPoint>();
            foreach (var member in pointsToClassify)
            {
                lookupPointById[member.UniqueId] = member;
            }

            // Each point starts in its own class, labeled with a running counter.
            int labelCounter = 1;
            var subClassification = new Classification<UnsignedPoint, string>(
                pointsToClassify,
                unusedPoint => (labelCounter++).ToString()
            );
            var hIndex = new HilbertIndex(subClassification, Configuration.Index.BitsPerDimension);

            // 4. Create a DensityClassifier, properly configured.
            var unmergeableSize = (int)(pointsToClassify.Count * Configuration.DensityClassifier.UnmergeableSizeFraction);
            var densityClassifier = new DensityClassifier(hIndex, MergeSquareDistance, unmergeableSize)
            {
                NeighborhoodRadiusMultiplier = Configuration.DensityClassifier.NeighborhoodRadiusMultiplier,
                OutlierSize = Configuration.DensityClassifier.OutlierSize,
                MergeableShrinkage = Configuration.DensityClassifier.MergeableShrinkage
            };

            // 5. Reclassify.
            //    This classification is in terms of HilbertPoints, so afterwards we will need to map them to
            //    their non-HilbertPoint, original UnsignedPoints.
            var densityClassification = densityClassifier.Classify();

            // 6. Only act if the points were divided into more than one cluster.
            if (densityClassification.NumPartitions <= 1)
            {
                continue;
            }
            numberOfClustersSplit++;

            // 7. Loop through all HilbertPoints from the cluster and find the corresponding UnsignedPoints.
            foreach (var hPoint in densityClassification.Points())
            {
                var uPoint = lookupPointById[hPoint.UniqueId];

                // Form the new class label by appending the previous label and the density-based label.
                var previousClassLabel = FinalClassification.GetClassLabel(uPoint);
                var densityClassLabel = densityClassification.GetClassLabel(hPoint);
                var newClassLabel = $"{previousClassLabel}-{densityClassLabel}";

                // 8. Pull the point from its current cluster and add it to a new cluster.
                FinalClassification.Remove(uPoint);
                FinalClassification.Add(uPoint, newClassLabel);
            }
        }
    }
    finally
    {
        // Keep the Start/Stop pair balanced even when reclassification throws.
        Timer.Stop("Reclassify by density");
    }

    Logger.Info($"Clusters split due to density-based reclassification: {numberOfClustersSplit}");
}
/// <summary>
/// Creates a ClosestCluster that works on the given classification of points.
/// </summary>
/// <param name="clusters">Classification that divides the points into distinct clusters; stored in Clusters.</param>
public ClosestCluster(Classification<UnsignedPoint, TLabel> clusters)
{
    this.Clusters = clusters;
}
/// <summary>
/// Refresh Color1 and Color2 so that they hold the labels that Point1 and Point2 currently carry.
/// Needed because categories may have been merged since this pair was created.
/// </summary>
/// <param name="clusters">Current clustering of points.</param>
/// <returns>This same instance, after its colors have been updated.</returns>
public ClosestPair Relabel(Classification<UnsignedPoint, TLabel> clusters)
{
    this.Color1 = clusters.GetClassLabel(this.Point1);
    this.Color2 = clusters.GetClassLabel(this.Point2);
    return this;
}