/// <summary>
/// Gets the blocks for a sequence of numerical and target data.
/// </summary>
/// <remarks>
/// Blocks are obtained from the bins returned by
/// <see cref="NumericalBin.GetNumericalBins"/>: every maximal run of
/// adjacent bins whose target distributions are not heterogeneous and
/// that share the same target mode is collapsed into a single block;
/// every other bin is returned unchanged as a block of its own.
/// </remarks>
/// <param name="numericalData">The numerical data.</param>
/// <param name="targetData">The target data.</param>
/// <returns>
/// The collection of <see cref="NumericalBlock"/> instances
/// corresponding to the specified numerical and target data.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="numericalData"/> is <b>null</b>.<br/>
/// -or-<br/>
/// <paramref name="targetData"/> is <b>null</b>.
/// </exception>
/// <exception cref="ArgumentException">
/// Parameter <paramref name="targetData"/> has not the
/// same <see cref="DoubleMatrix.Count"/> of
/// parameter <paramref name="numericalData"/>.
/// </exception>
public static List<NumericalBlock> GetNumericalBlocks(
    DoubleMatrix numericalData,
    DoubleMatrix targetData)
{
    List<NumericalBin> bins =
        NumericalBin.GetNumericalBins(numericalData, targetData);

    int binCount = bins.Count;
    var blocks = new List<NumericalBlock>(binCount);

    // A lone bin has no neighbor to be merged with.
    if (binCount == 1)
    {
        blocks.Add(bins[0]);
        return blocks;
    }

    // Target codes used to initialize the distribution of merged blocks.
    var targetCodes = bins[0].targetFrequencyDistribution.Keys.ToArray();

    int start = 0;
    while (start < binCount)
    {
        var head = bins[start];

        // A bin having a heterogeneous target distribution
        // (i.e. two or more target values with nonzero frequency)
        // is never merged: it is a block as it stands.
        if (head.IsTargetDistributionHeterogeneous)
        {
            blocks.Add(head);
            start++;
            continue;
        }

        // The head bin is not heterogeneous: absorb every directly
        // following bin which is also not heterogeneous and has the
        // same target mode, accumulating the frequency of that mode.
        double sharedMode = head.Mode;
        int pooledFrequency = head.targetFrequencyDistribution[sharedMode];

        int stop = start + 1;
        while (stop < binCount)
        {
            NumericalBlock candidate = bins[stop];
            if (candidate.IsTargetDistributionHeterogeneous
                || candidate.Mode != sharedMode)
            {
                // The run of compatible bins ends here.
                break;
            }

            pooledFrequency +=
                candidate.targetFrequencyDistribution[sharedMode];
            stop++;
        }

        bool absorbedAny = stop > start + 1;
        if (absorbedAny)
        {
            // The merged block spans from the first position of the
            // head bin to the last position of the last absorbed bin.
            NumericalBlock tail = bins[stop - 1];

            var merged = new NumericalBlock(
                head.firstPosition,
                tail.lastValue,
                targetCodes)
            {
                firstValue = head.firstValue,
                lastPosition = tail.lastPosition
            };
            merged.targetFrequencyDistribution[sharedMode] = pooledFrequency;

            blocks.Add(merged);
        }
        else
        {
            // Nothing to merge with: the head bin is a block.
            blocks.Add(head);
        }

        // Resume right after the bins consumed by this step.
        start = stop;
    }

    return blocks;
}
/// <summary>
/// Gets the bins for a sequence of numerical and target data.
/// </summary>
/// <remarks>
/// The numerical data are sorted in ascending order and every run of
/// consecutive equal values in that ordering becomes one bin. Each bin
/// records the first and last positions of its run and counts how many
/// times each target value occurs at those positions.
/// </remarks>
/// <param name="numericalData">The numerical data.</param>
/// <param name="targetData">The target data.</param>
/// <returns>
/// The collection of <see cref="NumericalBin"/> instances
/// corresponding to the specified numerical and target data.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="numericalData"/> is <b>null</b>. <br/>
/// -or- <br/>
/// <paramref name="targetData"/> is <b>null</b>.
/// </exception>
/// <exception cref="ArgumentException">
/// Parameter <paramref name="targetData"/> has not the
/// same <see cref="DoubleMatrix.Count"/> of
/// parameter <paramref name="numericalData"/>.
/// </exception>
public static List<NumericalBin> GetNumericalBins(
    DoubleMatrix numericalData,
    DoubleMatrix targetData)
{
    #region Input validation

    if (numericalData is null)
    {
        throw new ArgumentNullException(nameof(numericalData));
    }

    if (targetData is null)
    {
        throw new ArgumentNullException(nameof(targetData));
    }

    if (numericalData.Count != targetData.Count)
    {
        var messageTemplate = ImplementationServices.GetResourceString(
            "STR_EXCEPT_PAR_MUST_HAVE_SAME_COUNT");

        throw new ArgumentException(
            string.Format(
                CultureInfo.InvariantCulture,
                messageTemplate,
                nameof(numericalData)),
            nameof(targetData));
    }

    #endregion

    // A single observation yields a single bin; the target storage
    // itself is passed as the set of target codes.
    if (numericalData.Count == 1)
    {
        var onlyBin = new NumericalBin(
            0,
            numericalData[0],
            targetData.GetStorage())
        {
            lastPosition = 0
        };
        onlyBin.targetFrequencyDistribution[targetData[0]]++;

        return new List<NumericalBin>(1) { onlyBin };
    }

    // Order the numerical values and rearrange the targets accordingly.
    SortIndexResults ordering = Stat.SortIndex(
        numericalData,
        SortDirection.Ascending);
    var sortedValues = ordering.SortedData;
    var sortedTargets = targetData.Vec(ordering.SortedIndexes);

    // Distinct target values, in ascending order.
    var targetCodes = sortedTargets
        .Distinct()
        .OrderBy(code => code)
        .ToArray();

    var bins = new List<NumericalBin>();
    int finalPosition = sortedValues.Count - 1;

    // The bin currently being filled; null when the next position
    // must open a new one.
    NumericalBin openBin = null;

    for (int position = 0; position <= finalPosition; position++)
    {
        openBin ??= new NumericalBin(
            position,
            sortedValues[position],
            targetCodes);

        // Count the target observed at this position.
        openBin.targetFrequencyDistribution[sortedTargets[position]]++;

        // A run ends at the final position, or wherever the next
        // sorted value differs from the current one (a cut point).
        bool runEnds =
            position == finalPosition
            || sortedValues[position] != sortedValues[position + 1];

        if (runEnds)
        {
            openBin.lastPosition = position;
            bins.Add(openBin);
            openBin = null;
        }
    }

    return bins;
}