/// <summary>
/// Evaluates a non-numeric attribute as a multi-way split: rows are grouped by their attribute
/// value and the branch-weighted entropy and split information of that grouping are computed.
/// </summary>
/// <param name="dataRowsIndexes">Row indexes (into <c>_data</c>) to evaluate.</param>
/// <param name="attributeIndex">Column index of the attribute to split on.</param>
/// <returns>
/// A result with <c>IsNumeric = false</c>, <c>KnownValues</c> set to the number of rows for which
/// <c>IsUnknown</c> returned false, <c>EntropyValue</c> set to the sum of
/// (branch probability * branch entropy), <c>SplitInformation</c> accumulated as
/// -sum(p * log2 p), and lazily created subsets (one per distinct value).
/// </returns>
private ComputeAttributeEntropyResult ComputeAttributeEntropyNotNumeric(IList<int> dataRowsIndexes, int attributeIndex)
{
    var freq = new Dictionary<object, IList<int>>();
    var itemCount = 0;
    var ret = new ComputeAttributeEntropyResult
    {
        AttributeIndex = attributeIndex,
        IsNumeric = false
    };

    foreach (var rowIndex in dataRowsIndexes)
    {
        var currentValue = _data[rowIndex, attributeIndex];
        if (IsUnknown(currentValue))
        {
            // Rows without a known value take no part in the grouping or in KnownValues.
            continue;
        }

        // Single hash lookup per row instead of ContainsKey + Add + indexer.
        IList<int> bucket;
        if (!freq.TryGetValue(currentValue, out bucket))
        {
            bucket = new List<int>();
            freq.Add(currentValue, bucket);
        }

        bucket.Add(rowIndex);
        itemCount++;
    }

    ret.KnownValues = itemCount;

    var sum = 0.0;
    foreach (var item in freq)
    {
        var list = item.Value;
        var statistics = ComputeStatistics(list);
        var branchProbability = list.Count / (double)itemCount;
        sum += branchProbability * statistics.Entropy;

        // Guard keeps log2(0) out of the sum; every bucket holds at least one row, so this
        // is purely defensive.
        if (branchProbability > 0)
        {
            ret.SplitInformation -= branchProbability * Math.Log(branchProbability, 2);
        }
    }

    // Subset objects are only built if a caller actually asks for them.
    ret.Subsets = new Lazy<IEnumerable<ComputeAttributeEntropyResult.Subset>>(
        () => freq.Select(
            item => new ComputeAttributeEntropyResult.Subset
            {
                Rows = item.Value,
                Value = item.Key
            }));
    ret.EntropyValue = sum;
    return ret;
}
/// <summary>
/// Evaluates a non-numeric attribute as a multi-way split over row objects: rows are grouped by
/// their attribute value and the branch-weighted entropy of that grouping is computed.
/// </summary>
/// <param name="dataRows">Rows to evaluate.</param>
/// <param name="attribute">Name of the attribute to split on.</param>
/// <returns>
/// A result with <c>IsNumeric = false</c>, <c>KnownValues</c> set to the number of rows for which
/// <c>IsUnknown</c> returned false, <c>EntropyValue</c> set to the sum of
/// (branch size / known rows) * branch entropy, and one subset per distinct value.
/// This overload does not set <c>SplitInformation</c>.
/// </returns>
private ComputeAttributeEntropyResult ComputeAttributeEntropyNotNumeric(IList<IDataRow> dataRows, string attribute)
{
    var freq = new Dictionary<object, IList<IDataRow>>();
    var itemCount = 0;
    var ret = new ComputeAttributeEntropyResult { Attribute = attribute, IsNumeric = false };

    foreach (var row in dataRows)
    {
        var currentValue = row[attribute];
        if (IsUnknown(currentValue))
        {
            // Rows without a known value take no part in the grouping or in KnownValues.
            continue;
        }

        // Single hash lookup per row instead of ContainsKey + Add + indexer.
        IList<IDataRow> bucket;
        if (!freq.TryGetValue(currentValue, out bucket))
        {
            bucket = new List<IDataRow>();
            freq.Add(currentValue, bucket);
        }

        bucket.Add(row);
        itemCount++;
    }

    ret.KnownValues = itemCount;

    var sum = 0.0;
    foreach (var item in freq)
    {
        var list = item.Value;
        bool sameClass; // required by the out parameter; the value is not used here
        sum += (((double)list.Count) / itemCount) * ComputeEntropy(list, out sameClass);
    }

    // Deferred LINQ query: Subset objects are created each time Subsets is enumerated.
    ret.Subsets = freq.Select(item => new ComputeAttributeEntropyResult.Subset { Rows = item.Value, Value = item.Key });
    ret.EntropyValue = sum;
    return ret;
}
/// <summary>
/// Evaluates a numeric attribute as a binary split over row objects. Rows with a known value are
/// ordered by their numeric value, every position between two rows with different values is
/// scored by the size-weighted class entropy of the two sides, and the lowest-scoring position
/// is kept.
/// </summary>
/// <param name="dataRows">Rows to evaluate.</param>
/// <param name="attribute">Name of the numeric attribute to split on.</param>
/// <returns>
/// A result with <c>IsNumeric = true</c>, <c>KnownValues</c> set to the number of rows with a
/// known value, <c>EntropyValue</c> set to the best score found (<c>Double.MaxValue</c> when no
/// position was scored), and subsets produced by <c>rows.Split(minsplitIndex)</c>, each tagged
/// with the attribute value of the row at the chosen position.
/// </returns>
/// <remarks>
/// Class counts on both sides are maintained incrementally with plain counters, so the scan does
/// not copy or shift row lists. An input with no known values still fails on the final row
/// lookup, as before.
/// </remarks>
private ComputeAttributeEntropyResult ComputeAttributeEntropyNumericBinary(IList<IDataRow> dataRows, string attribute)
{
    var rows = dataRows
        .Where(item => !IsUnknown(item[attribute]))
        .OrderBy(item =>
        {
            double value;
            if (!item[attribute].TryConvertToNumeric(out value))
            {
                value = Double.NaN;
            }
            return value;
        })
        .ToList();

    var ret = new ComputeAttributeEntropyResult
    {
        IsNumeric = true,
        Attribute = attribute,
        KnownValues = rows.Count
    };

    var rowsCount = rows.Count;
    var minimumAttributeValue = Double.MaxValue;
    int minsplitIndex = 0;
    var freqLeft = new Dictionary<string, int>();
    var freqRight = new Dictionary<string, int>();

    // Everything starts on the right-hand side.
    foreach (var row in rows)
    {
        int seen;
        freqRight.TryGetValue(row.Class, out seen);
        freqRight[row.Class] = seen + 1;
    }

    var leftCount = 0;
    var rightCount = rowsCount;

    for (int index = 0; index < rowsCount - 1; index++)
    {
        var currentItem = rows[index];
        var nextItem = rows[index + 1];

        // Move the current row from the right side to the left side.
        leftCount++;
        rightCount--;

        int leftSeen;
        freqLeft.TryGetValue(currentItem.Class, out leftSeen);
        freqLeft[currentItem.Class] = leftSeen + 1;
        freqRight[currentItem.Class]--;

        double leftItemValue;
        if (!currentItem[attribute].TryConvertToNumeric(out leftItemValue))
        {
            leftItemValue = Double.NaN;
        }

        double rightItemValue;
        if (!nextItem[attribute].TryConvertToNumeric(out rightItemValue))
        {
            rightItemValue = Double.NaN;
        }

        // double.Epsilon is the smallest positive double, so this only skips positions whose
        // two neighbouring values have a difference of exactly zero.
        if (Math.Abs(leftItemValue - rightItemValue) < double.Epsilon)
        {
            continue;
        }

        var sumLeft = 0.0;
        var sumRight = 0.0;

        foreach (var item in freqLeft)
        {
            var val = ((double)item.Value) / leftCount;
            sumLeft += val * Math.Log(val, 2);
        }

        foreach (var item in freqRight)
        {
            if (item.Value == 0)
            {
                // Class has fully moved to the left side; 0 * log2(0) is treated as 0.
                continue;
            }

            var val = ((double)item.Value) / rightCount;
            sumRight += val * Math.Log(val, 2);
        }

        var leftValue = (((double)leftCount) / rowsCount) * (-sumLeft);
        var rightValue = (((double)rightCount) / rowsCount) * (-sumRight);
        var currentAttributeValue = leftValue + rightValue;

        if (currentAttributeValue < minimumAttributeValue)
        {
            minsplitIndex = index;
            minimumAttributeValue = currentAttributeValue;
        }
    }

    // Read the split value from the sorted rows themselves, so a single known row
    // (where the scan above never runs) is handled like in the index-based overloads.
    double splitValue;
    rows[minsplitIndex][attribute].TryConvertToNumeric(out splitValue);

    ret.EntropyValue = minimumAttributeValue;
    ret.Subsets = rows.Split(minsplitIndex)
        .Select(item => new ComputeAttributeEntropyResult.Subset { Rows = item, Value = splitValue });
    return ret;
}
/// <summary>
/// Evaluates a numeric attribute as a binary split over row indexes. Rows with a known value are
/// ordered by their numeric value, every position between two rows with different values is
/// scored by the size-weighted class entropy of the two sides, and the lowest-scoring position
/// is kept.
/// </summary>
/// <param name="dataRowsIndexes">Row indexes (into <c>_data</c>) to evaluate.</param>
/// <param name="attributeIndex">Column index of the numeric attribute to split on.</param>
/// <returns>
/// A result with <c>IsNumeric = true</c>, <c>KnownValues</c> set to the number of rows with a
/// known value, <c>EntropyValue</c> set to the best score found (<c>Double.MaxValue</c> when no
/// position was scored), and subsets produced by <c>rows.Split(minsplitIndex)</c>, each tagged
/// with the attribute value of the row at the chosen position.
/// </returns>
/// <remarks>
/// Candidate positions are filtered only by "neighbouring values differ". Skipping positions
/// whose two neighbouring rows share a class is not safe when attribute values tie: for values
/// 1(A), 2(A), 2(B), 3(B) the A/A and B/B neighbours hide both real boundaries and the only
/// class change sits between two equal values, so no position would be scored at all.
/// </remarks>
private ComputeAttributeEntropyResult ComputeAttributeEntropyNumericBinary(IList<int> dataRowsIndexes, int attributeIndex)
{
    var rows = dataRowsIndexes
        .Where(item => !IsUnknown(_data[item, attributeIndex]))
        .OrderBy(item =>
        {
            double value;
            if (!_data[item, attributeIndex].TryConvertToNumeric(out value))
            {
                value = Double.NaN;
            }
            return value;
        })
        .ToArray();

    var ret = new ComputeAttributeEntropyResult
    {
        IsNumeric = true,
        AttributeIndex = attributeIndex,
        KnownValues = rows.Length
    };

    var rowsCount = rows.Length;
    var minimumAttributeValue = Double.MaxValue;
    int minsplitIndex = 0;

    // Per-class counters for each side, indexed by the class id returned by _data.Class;
    // the two lists remember which class ids have been seen so only those are iterated.
    var freqLeft = new int[_data.ClassesValue.Length];
    var freqRight = new int[_data.ClassesValue.Length];
    var rightListVal = new List<int>();
    var leftListVal = new List<int>();

    // Everything starts on the right-hand side.
    foreach (var rowIndex in rows)
    {
        var classVal = _data.Class(rowIndex);
        if (freqRight[classVal] == 0)
        {
            rightListVal.Add(classVal);
        }
        freqRight[classVal]++;
    }

    var leftCount = 0;
    var rightCount = rows.Length;

    for (int index = 0; index < rowsCount - 1; index++)
    {
        var currentItemClass = _data.Class(rows[index]);
        var currentItemValue = _data[rows[index], attributeIndex];
        var nextItemValue = _data[rows[index + 1], attributeIndex];

        // Move the current row from the right side to the left side.
        leftCount++;
        rightCount--;

        if (freqLeft[currentItemClass] == 0)
        {
            leftListVal.Add(currentItemClass);
        }
        freqLeft[currentItemClass]++;
        freqRight[currentItemClass]--;

        double leftItemValue;
        if (!currentItemValue.TryConvertToNumeric(out leftItemValue))
        {
            leftItemValue = Double.NaN;
        }

        double rightItemValue;
        if (!nextItemValue.TryConvertToNumeric(out rightItemValue))
        {
            rightItemValue = Double.NaN;
        }

        // A threshold cannot separate two rows that carry the same value.
        if (leftItemValue == rightItemValue)
        {
            continue;
        }

        var sumLeft = 0.0;
        var sumRight = 0.0;

        foreach (var item in leftListVal)
        {
            var val = ((double)freqLeft[item]) / leftCount;
            sumLeft += val * Math.Log(val, 2);
        }

        foreach (var item in rightListVal)
        {
            if (freqRight[item] == 0)
            {
                // Class has fully moved to the left side; 0 * log2(0) is treated as 0.
                continue;
            }

            var val = ((double)freqRight[item]) / rightCount;
            sumRight += val * Math.Log(val, 2);
        }

        var leftValue = (((double)leftCount) / rowsCount) * (-sumLeft);
        var rightValue = (((double)rightCount) / rowsCount) * (-sumRight);
        var currentAttributeValue = leftValue + rightValue;

        if (currentAttributeValue < minimumAttributeValue)
        {
            minsplitIndex = index;
            minimumAttributeValue = currentAttributeValue;
        }
    }

    double splitValue;
    _data[rows[minsplitIndex], attributeIndex].TryConvertToNumeric(out splitValue);

    ret.EntropyValue = minimumAttributeValue;
    ret.Subsets = rows.Split(minsplitIndex)
        .Select(item => new ComputeAttributeEntropyResult.Subset { Rows = item, Value = splitValue });
    return ret;
}
/// <summary>
/// Evaluates a numeric attribute as a binary split over an array of row indexes. A sorted copy
/// of the indexes is scanned once; every position whose two neighbouring attribute values are
/// not equal is scored by the size-weighted class entropy of the two sides, and the
/// lowest-scoring position is kept together with its split information.
/// </summary>
/// <param name="dataRowsIndexes">Row indexes (into <c>_data</c>); the array itself is not modified.</param>
/// <param name="attributeIndex">Column index of the numeric attribute to split on.</param>
/// <returns>
/// A result with <c>IsNumeric = true</c>, <c>KnownValues</c> set to the number of input rows,
/// <c>EntropyValue</c> set to the best score found (<c>Double.MaxValue</c> when no position was
/// scored), <c>SplitInformation</c> for the chosen position, and lazily created subsets from
/// <c>Split</c> at that position.
/// </returns>
private ComputeAttributeEntropyResult ComputeAttributeEntropyNumericBinary(int[] dataRowsIndexes, int attributeIndex)
{
    // Sort a private copy; the ordering is whatever ComparerAttr defines
    // (presumably by attribute value - confirm against its declaration).
    var sorted = new int[dataRowsIndexes.Length];
    Buffer.BlockCopy(dataRowsIndexes, 0, sorted, 0, dataRowsIndexes.Length * sizeof(int));
    Array.Sort(sorted, new ComparerAttr(_data, attributeIndex));

    var result = new ComputeAttributeEntropyResult
    {
        IsNumeric = true,
        AttributeIndex = attributeIndex,
        KnownValues = sorted.Length
    };

    var bestScore = Double.MaxValue;
    var bestCut = 0;

    // Per-class counters for each side, indexed by the class id returned by _data.Class;
    // the "seen" lists record which class ids occur so only those are visited later.
    var countsLeft = new int[_data.ClassesValue.Length];
    var countsRight = new int[_data.ClassesValue.Length];
    var seenRight = new List<int>();
    var seenLeft = new List<int>();
    var total = sorted.Length;

    // Initially every row belongs to the right-hand side.
    foreach (var row in sorted)
    {
        var cls = _data.Class(row);
        if (countsRight[cls]++ == 0)
        {
            seenRight.Add(cls);
        }
    }

    for (var cut = 0; cut < total - 1; cut++)
    {
        var movedClass = _data.Class(sorted[cut]);
        var valueHere = _data[sorted[cut], attributeIndex];
        var valueNext = _data[sorted[cut + 1], attributeIndex];

        // After moving row `cut` across, the left side holds cut + 1 rows.
        var leftSize = cut + 1;
        var rightSize = total - leftSize;

        if (countsLeft[movedClass]++ == 0)
        {
            seenLeft.Add(movedClass);
        }
        countsRight[movedClass]--;

        // Only positions between two non-equal values are scored.
        if (!valueHere.Equals(valueNext))
        {
            var logSumLeft = 0.0;
            for (var k = 0; k < seenLeft.Count; k++)
            {
                var share = ((double)countsLeft[seenLeft[k]]) / leftSize;
                logSumLeft += share * Math.Log(share, 2);
            }

            var logSumRight = 0.0;
            for (var k = 0; k < seenRight.Count; k++)
            {
                var remaining = countsRight[seenRight[k]];
                if (remaining != 0)
                {
                    var share = ((double)remaining) / rightSize;
                    logSumRight += share * Math.Log(share, 2);
                }
            }

            var leftWeight = leftSize / (double)total;
            var rightWeight = rightSize / (double)total;
            var score = leftWeight * (-logSumLeft) + rightWeight * (-logSumRight);

            if (score < bestScore)
            {
                bestCut = cut;
                bestScore = score;
                result.SplitInformation =
                    -(leftWeight * Math.Log(leftWeight, 2) + rightWeight * Math.Log(rightWeight, 2));
            }
        }
    }

    double threshold;
    _data[sorted[bestCut], attributeIndex].TryConvertToNumeric(out threshold);

    result.EntropyValue = bestScore;
    // Subsets are built on demand from the sorted copy and the chosen cut position.
    result.Subsets = new Lazy<IEnumerable<ComputeAttributeEntropyResult.Subset>>(
        () => sorted.Split(bestCut)
            .Select(part => new ComputeAttributeEntropyResult.Subset { Rows = part, Value = threshold }));
    return result;
}
/// <summary>
/// Scores every candidate attribute on the given rows in parallel and returns the one with the
/// highest gain ratio.
/// </summary>
/// <param name="dataRowsIndexes">Row indexes (into <c>_data</c>) of the current node.</param>
/// <param name="attributesIndexes">Indexes of the attributes that may be used for the split.</param>
/// <returns>
/// A tuple of (best gain ratio, entropy result of the chosen attribute, class statistics of the
/// rows). When all rows share one class the tuple is (<c>double.MaxValue</c>, <c>null</c>,
/// statistics). When there are no candidate attributes the tuple is
/// (<c>Double.MinValue</c>, <c>null</c>, statistics).
/// </returns>
/// <remarks>
/// Gain is (node entropy - attribute entropy) * KnownValues / row count; the gain ratio divides
/// that by the attribute's split information. The comparison is strict, so among attributes
/// with an equal gain ratio the one whose worker reaches the lock first is kept.
/// </remarks>
private Tuple<double, ComputeAttributeEntropyResult, Statistics> ComputeBestGain(int[] dataRowsIndexes, IList<int> attributesIndexes)
{
    var statistics = ComputeStatistics(dataRowsIndexes);
    if (statistics.SameClass)
    {
        return new Tuple<double, ComputeAttributeEntropyResult, Statistics>(double.MaxValue, null, statistics);
    }

    var maxGainRatio = Double.MinValue;
    ComputeAttributeEntropyResult minAttributeEntropyResult = null;

    // Partitioner.Create(from, to) rejects an empty range with ArgumentOutOfRangeException,
    // so return the "nothing evaluated" result directly when there are no candidates.
    if (attributesIndexes.Count == 0)
    {
        return new Tuple<double, ComputeAttributeEntropyResult, Statistics>(maxGainRatio, minAttributeEntropyResult, statistics);
    }

    var collectionPartitioner = Partitioner.Create(0, attributesIndexes.Count);
    var locker = new object();

    Parallel.ForEach(collectionPartitioner, (range, loopState) =>
    {
        for (int i = range.Item1; i < range.Item2; i++)
        {
            var attribute = attributesIndexes[i];

            // The expensive part runs outside the lock.
            var attributeEntropy = ComputeAttributeEntropy(dataRowsIndexes, attribute);
            var gain = (statistics.Entropy - attributeEntropy.EntropyValue) * attributeEntropy.KnownValues /
                       dataRowsIndexes.Length;
            var gainRatio = gain / attributeEntropy.SplitInformation;

            lock (locker)
            {
                // The first result to arrive is the fallback even if its ratio never wins.
                if (minAttributeEntropyResult == null)
                {
                    minAttributeEntropyResult = attributeEntropy;
                }

                if (gainRatio > maxGainRatio)
                {
                    maxGainRatio = gainRatio;
                    minAttributeEntropyResult = attributeEntropy;
                }
                else if (minAttributeEntropyResult != attributeEntropy)
                {
                    // Losing candidate: drop its subsets so they can be collected.
                    attributeEntropy.Subsets = null;
                }
                else if (Double.IsNegativeInfinity(gain))
                {
                    // NOTE(review): only reachable when this candidate is the current fallback
                    // and did not win; other attributes with -infinity gain take the branch
                    // above and are not recorded. Confirm this is the intended rule.
                    minAttributeEntropyResult.InvalidAttributes.Add(attribute);
                }
            }
        }
    });

    return new Tuple<double, ComputeAttributeEntropyResult, Statistics>(maxGainRatio, minAttributeEntropyResult, statistics);
}