/// <summary>
/// Scores a non-numeric attribute by splitting the rows into one subset per distinct known value.
/// </summary>
/// <param name="dataRowsIndexes">Indexes of the rows (into <c>_data</c>) to evaluate.</param>
/// <param name="attributeIndex">Index of the attribute to split on.</param>
/// <returns>
/// A result holding the count of rows with a known value, the subset-size-weighted entropy,
/// the accumulated split information and a lazily built list of the per-value subsets.
/// </returns>
private ComputeAttributeEntropyResult ComputeAttributeEntropyNotNumeric(IList <int> dataRowsIndexes,
                                                                                int attributeIndex)
        {
            var result = new ComputeAttributeEntropyResult {
                AttributeIndex = attributeIndex, IsNumeric = false
            };

            // Bucket the row indexes by attribute value; rows with an unknown value are left out.
            var rowsByValue = new Dictionary <object, IList <int> >();
            var knownCount  = 0;

            foreach (var rowIndex in dataRowsIndexes)
            {
                var value = _data[rowIndex, attributeIndex];
                if (IsUnknown(value))
                {
                    continue;
                }

                IList <int> bucket;
                if (!rowsByValue.TryGetValue(value, out bucket))
                {
                    bucket = new List <int>();
                    rowsByValue.Add(value, bucket);
                }

                bucket.Add(rowIndex);
                knownCount++;
            }

            result.KnownValues = knownCount;

            // Each bucket contributes its entropy weighted by the share of known rows it holds.
            var weightedEntropy = 0.0;

            foreach (var entry in rowsByValue)
            {
                var bucketRows  = entry.Value;
                var bucketStats = ComputeStatistics(bucketRows);
                var share       = bucketRows.Count / (double)knownCount;

                weightedEntropy += share * bucketStats.Entropy;

                if (share > 0)
                {
                    result.SplitInformation -= share * Math.Log(share, 2);
                }
            }

            // The subsets are only materialised if somebody asks for them.
            result.Subsets =
                new Lazy <IEnumerable <ComputeAttributeEntropyResult.Subset> >(
                    () => rowsByValue.Select(
                        entry => new ComputeAttributeEntropyResult.Subset {
                            Rows = entry.Value, Value = entry.Key
                        }));
            result.EntropyValue = weightedEntropy;

            return result;
        }
Пример #2
0
        /// <summary>
        /// Scores a non-numeric attribute by splitting the rows into one subset per distinct known value.
        /// </summary>
        /// <param name="dataRows">Rows to evaluate.</param>
        /// <param name="attribute">Name of the attribute to split on.</param>
        /// <returns>
        /// A result holding the count of rows with a known value, the subset-size-weighted entropy
        /// and a deferred sequence of the per-value subsets.
        /// </returns>
        private ComputeAttributeEntropyResult ComputeAttributeEntropyNotNumeric(IList<IDataRow> dataRows,
            string attribute)
        {
            var result = new ComputeAttributeEntropyResult {Attribute = attribute, IsNumeric = false};

            // Bucket the rows by attribute value; rows with an unknown value are left out.
            var rowsByValue = new Dictionary<object, IList<IDataRow>>();
            var knownCount = 0;

            foreach (var row in dataRows)
            {
                var value = row[attribute];
                if (IsUnknown(value))
                {
                    continue;
                }

                IList<IDataRow> bucket;
                if (!rowsByValue.TryGetValue(value, out bucket))
                {
                    bucket = new List<IDataRow>();
                    rowsByValue.Add(value, bucket);
                }

                bucket.Add(row);
                knownCount++;
            }

            result.KnownValues = knownCount;

            // Each bucket contributes its entropy weighted by the share of known rows it holds.
            var weightedEntropy = 0.0;
            foreach (var entry in rowsByValue)
            {
                var bucketRows = entry.Value;
                var share = bucketRows.Count / (double) knownCount;

                bool sameClass; // demanded by ComputeEntropy's signature; not used here
                weightedEntropy += share * ComputeEntropy(bucketRows, out sameClass);
            }

            result.Subsets = rowsByValue.Select(
                entry => new ComputeAttributeEntropyResult.Subset {Rows = entry.Value, Value = entry.Key});
            result.EntropyValue = weightedEntropy;

            return result;
        }
Пример #3
0
        /// <summary>
        /// Finds the binary split of a numeric attribute that yields the lowest weighted class entropy.
        /// </summary>
        /// <remarks>
        /// Rows with an unknown value are dropped and the rest are ordered by their numeric value (values that
        /// do not convert are ordered as <see cref="Double.NaN"/>). Every position where two neighbouring rows
        /// carry different values is scored as a candidate split.
        /// </remarks>
        /// <param name="dataRows">Rows to evaluate.</param>
        /// <param name="attribute">Name of the attribute to split on.</param>
        /// <returns>
        /// A result whose <c>EntropyValue</c> is the best weighted entropy found, or
        /// <see cref="Double.MaxValue"/> when no candidate split exists, and whose <c>Subsets</c> are the two
        /// halves produced by splitting the ordered rows at the chosen index.
        /// </returns>
        private ComputeAttributeEntropyResult ComputeAttributeEntropyNumericBinary(IList<IDataRow> dataRows,
            string attribute)
        {
            var rows = dataRows.Where(item => !IsUnknown(item[attribute])).OrderBy(item =>
            {
                double value;
                if (!item[attribute].TryConvertToNumeric(out value))
                {
                    value = Double.NaN;
                }
                return value;
            }).ToList();

            var ret = new ComputeAttributeEntropyResult
            {
                IsNumeric = true,
                Attribute = attribute,
                KnownValues = rows.Count
            };

            var rowsCount = rows.Count;
            var minimumAttributeValue = Double.MaxValue;
            int minsplitIndex = 0;

            // Class histograms for the rows on each side of the candidate split.
            var freqLeft = new Dictionary<string, int>();
            var freqRight = new Dictionary<string, int>();

            // Initially every row is on the right-hand side.
            foreach (var row in rows)
            {
                var rowClass = row.Class;
                int seen;
                freqRight.TryGetValue(rowClass, out seen);
                freqRight[rowClass] = seen + 1;
            }

            // Only the sizes of the two sides are needed, so plain counters replace the former row lists
            // (removing the head of a List on every step made the scan quadratic).
            var leftCount = 0;
            var rightCount = rowsCount;

            for (int index = 0; index < rowsCount - 1; index++)
            {
                var currentItem = rows[index];
                var nextItem = rows[index + 1];
                var currentClass = currentItem.Class;

                // Move the current row from the right side to the left side.
                leftCount++;
                rightCount--;

                int leftSeen;
                freqLeft.TryGetValue(currentClass, out leftSeen);
                freqLeft[currentClass] = leftSeen + 1;

                freqRight[currentClass]--;

                // Note: the "neighbours share a class" shortcut is intentionally not applied here;
                // every boundary between two different values is scored.

                double leftItemValue;
                if (!currentItem[attribute].TryConvertToNumeric(out leftItemValue))
                {
                    leftItemValue = Double.NaN;
                }

                double rightItemValue;
                if (!nextItem[attribute].TryConvertToNumeric(out rightItemValue))
                {
                    rightItemValue = Double.NaN;
                }

                // A split cannot separate rows that carry the same value.
                if (Math.Abs(leftItemValue - rightItemValue) < double.Epsilon)
                {
                    continue;
                }

                var sumLeft = 0.0;
                var sumRight = 0.0;

                foreach (var item in freqLeft)
                {
                    var val = ((double) item.Value)/leftCount;
                    sumLeft += val*Math.Log(val, 2);
                }

                foreach (var item in freqRight)
                {
                    // Classes that have fully moved to the left would give log(0).
                    if (item.Value == 0)
                    {
                        continue;
                    }
                    var val = ((double) item.Value)/rightCount;
                    sumRight += val*Math.Log(val, 2);
                }

                var leftValue = (((double) leftCount)/rowsCount)*(-sumLeft);
                var rightValue = (((double) rightCount)/rowsCount)*(-sumRight);

                var currentAttributeValue = leftValue + rightValue;
                if (currentAttributeValue < minimumAttributeValue)
                {
                    minsplitIndex = index;
                    minimumAttributeValue = currentAttributeValue;
                }
            }

            // Read the split value from the ordered rows. The former left-side list was empty when only one
            // row had a known value, so indexing it threw even though a row was available.
            // NOTE(review): with no known rows at all this still throws ArgumentOutOfRangeException, as before.
            double splitValue;
            rows[minsplitIndex][attribute].TryConvertToNumeric(out splitValue);

            ret.EntropyValue = minimumAttributeValue;
            ret.Subsets =
                rows.Split(minsplitIndex)
                    .Select(item => new ComputeAttributeEntropyResult.Subset {Rows = item, Value = splitValue});

            return ret;
        }
Пример #4
0
        /// <summary>
        /// Finds the binary split of a numeric attribute that yields the lowest weighted class entropy.
        /// </summary>
        /// <remarks>
        /// Rows with an unknown value are dropped and the remaining row indexes are ordered by numeric value
        /// (values that do not convert are ordered as <see cref="Double.NaN"/>). Every position where two
        /// neighbouring rows carry different values is scored as a candidate split.
        /// </remarks>
        /// <param name="dataRowsIndexes">Indexes of the rows (into <c>_data</c>) to evaluate.</param>
        /// <param name="attributeIndex">Index of the attribute to split on.</param>
        /// <returns>
        /// A result whose <c>EntropyValue</c> is the best weighted entropy found, or
        /// <see cref="Double.MaxValue"/> when no candidate split exists, and whose <c>Subsets</c> are the two
        /// halves produced by splitting the ordered row indexes at the chosen index.
        /// </returns>
        private ComputeAttributeEntropyResult ComputeAttributeEntropyNumericBinary(IList<int> dataRowsIndexes,
            int attributeIndex)
        {
            var rows = dataRowsIndexes.Where(item => !IsUnknown(_data[item, attributeIndex])).OrderBy(item =>
            {
                double value;
                if (!_data[item, attributeIndex].TryConvertToNumeric(out value))
                {
                    value = Double.NaN;
                }
                return value;
            }).ToArray();

            var ret = new ComputeAttributeEntropyResult
            {
                IsNumeric = true,
                AttributeIndex = attributeIndex,
                KnownValues = rows.Length
            };

            var rowsCount = rows.Length;
            var minimumAttributeValue = Double.MaxValue;
            int minsplitIndex = 0;

            // Per-class row counts on each side of the candidate split, indexed by class number.
            var freqLeft = new int[_data.ClassesValue.Length];
            var freqRight = new int[_data.ClassesValue.Length];
            // Classes that have appeared on each side, so the entropy loops skip absent classes.
            var rightListVal = new List<int>();
            var leftListVal = new List<int>();

            // Initially every row is on the right-hand side.
            foreach (var rowIndex in rows)
            {
                var classVal = _data.Class(rowIndex);
                if (freqRight[classVal] == 0)
                {
                    rightListVal.Add(classVal);
                }
                freqRight[classVal]++;
            }

            var leftCount = 0;
            var rightCount = rows.Length;

            for (int index = 0; index < rowsCount - 1; index++)
            {
                var currentItemClass = _data.Class(rows[index]);

                var currentItemValue = _data[rows[index], attributeIndex];
                var nextItemValue = _data[rows[index + 1], attributeIndex];

                // Move the current row from the right side to the left side.
                leftCount++;
                rightCount--;

                if (freqLeft[currentItemClass] == 0)
                {
                    leftListVal.Add(currentItemClass);
                }

                freqLeft[currentItemClass]++;

                freqRight[currentItemClass]--;

                // The former "neighbours share a class" shortcut is deliberately gone: it only holds when all
                // values are distinct. With tied values the order inside a tie is arbitrary, so a boundary
                // next to a mixed-class tie could be skipped and no split scored at all
                // (e.g. (1,A),(2,A),(2,B),(3,B)).

                double leftItemValue;
                if (!currentItemValue.TryConvertToNumeric(out leftItemValue))
                {
                    leftItemValue = Double.NaN;
                }

                double rightItemValue;
                if (!nextItemValue.TryConvertToNumeric(out rightItemValue))
                {
                    rightItemValue = Double.NaN;
                }

                // A split cannot separate rows that carry the same value.
                if (leftItemValue == rightItemValue)
                {
                    continue;
                }

                var sumLeft = 0.0;
                var sumRight = 0.0;

                foreach (var item in leftListVal)
                {
                    var val = ((double)freqLeft[item]) / leftCount;
                    sumLeft += val * Math.Log(val, 2);
                }

                foreach (var item in rightListVal)
                {
                    // Classes that have fully moved to the left would give log(0).
                    if (freqRight[item] == 0)
                    {
                        continue;
                    }
                    var val = ((double)freqRight[item]) / rightCount;
                    sumRight += val * Math.Log(val, 2);
                }

                var leftValue = (((double)leftCount) / rowsCount) * (-sumLeft);
                var rightValue = (((double)rightCount) / rowsCount) * (-sumRight);

                var currentAttributeValue = leftValue + rightValue;
                if (currentAttributeValue < minimumAttributeValue)
                {
                    minsplitIndex = index;
                    minimumAttributeValue = currentAttributeValue;
                }
            }

            // The value of the last row on the left side of the chosen split labels both subsets.
            double splitValue;
            _data[rows[minsplitIndex], attributeIndex].TryConvertToNumeric(out splitValue);

            ret.EntropyValue = minimumAttributeValue;
            ret.Subsets =
                rows.Split(minsplitIndex)
                    .Select(item => new ComputeAttributeEntropyResult.Subset { Rows = item, Value = splitValue });

            return ret;
        }
        /// <summary>
        /// Finds the binary split of a numeric attribute that yields the lowest weighted class entropy and
        /// records the split information of that split.
        /// </summary>
        /// <param name="dataRowsIndexes">Indexes of the rows (into <c>_data</c>) to evaluate; not modified.</param>
        /// <param name="attributeIndex">Index of the attribute to split on.</param>
        /// <returns>
        /// A result whose <c>EntropyValue</c> is the best weighted entropy found, or
        /// <see cref="Double.MaxValue"/> when no candidate split exists, and whose lazily built <c>Subsets</c>
        /// are the two halves produced by splitting the sorted row indexes at the chosen index.
        /// </returns>
        private ComputeAttributeEntropyResult ComputeAttributeEntropyNumericBinary(int[] dataRowsIndexes,
                                                                                   int attributeIndex)
        {
            // Sort a private copy so the caller's array keeps its order.
            var sorted = new int[dataRowsIndexes.Length];
            Buffer.BlockCopy(dataRowsIndexes, 0, sorted, 0, dataRowsIndexes.Length * sizeof(int));
            Array.Sort(sorted, new ComparerAttr(_data, attributeIndex));

            var result = new ComputeAttributeEntropyResult
            {
                IsNumeric      = true,
                AttributeIndex = attributeIndex,
                KnownValues    = sorted.Length
            };

            var total       = sorted.Length;
            var bestEntropy = Double.MaxValue;
            int bestCut     = 0;

            // Per-class row counts on each side of the cut, indexed by class number, plus the list of
            // classes that have appeared on each side.
            var leftHistogram  = new int[_data.ClassesValue.Length];
            var rightHistogram = new int[_data.ClassesValue.Length];
            var rightClasses   = new List <int>();
            var leftClasses    = new List <int>();

            // Everything starts on the right-hand side.
            for (var i = 0; i < total; i++)
            {
                var cls = _data.Class(sorted[i]);
                if (rightHistogram[cls] == 0)
                {
                    rightClasses.Add(cls);
                }
                rightHistogram[cls]++;
            }

            var leftSize  = 0;
            var rightSize = total;

            for (var cut = 0; cut < total - 1; cut++)
            {
                var cls = _data.Class(sorted[cut]);

                var valueHere = _data[sorted[cut], attributeIndex];
                var valueNext = _data[sorted[cut + 1], attributeIndex];

                // Move the current row from the right side to the left side.
                leftSize++;
                rightSize--;

                if (leftHistogram[cls] == 0)
                {
                    leftClasses.Add(cls);
                }
                leftHistogram[cls]++;
                rightHistogram[cls]--;

                // Only a boundary between two different values can be a split point.
                if (!valueHere.Equals(valueNext))
                {
                    var leftLogSum = 0.0;
                    foreach (var c in leftClasses)
                    {
                        var p = leftHistogram[c] / (double)leftSize;
                        leftLogSum += p * Math.Log(p, 2);
                    }

                    var rightLogSum = 0.0;
                    foreach (var c in rightClasses)
                    {
                        // Classes that have fully moved to the left would give log(0).
                        if (rightHistogram[c] != 0)
                        {
                            var p = rightHistogram[c] / (double)rightSize;
                            rightLogSum += p * Math.Log(p, 2);
                        }
                    }

                    var leftShare  = leftSize / (double)total;
                    var rightShare = rightSize / (double)total;

                    var leftPart  = leftShare * (-leftLogSum);
                    var rightPart = rightShare * (-rightLogSum);

                    var candidate = leftPart + rightPart;
                    if (candidate < bestEntropy)
                    {
                        bestCut     = cut;
                        bestEntropy = candidate;

                        result.SplitInformation =
                            -(leftShare * Math.Log(leftShare, 2) + rightShare * Math.Log(rightShare, 2));
                    }
                }
            }

            // The value of the last row on the left side of the chosen cut labels both subsets.
            double threshold;
            _data[sorted[bestCut], attributeIndex].TryConvertToNumeric(out threshold);

            result.EntropyValue = bestEntropy;
            result.Subsets      = new Lazy <IEnumerable <ComputeAttributeEntropyResult.Subset> >(
                () => sorted.Split(bestCut)
                      .Select(part => new ComputeAttributeEntropyResult.Subset {
                          Rows = part, Value = threshold
                      }));

            return result;
        }
        /// <summary>
        /// Evaluates every candidate attribute in parallel and returns the one with the highest gain ratio.
        /// </summary>
        /// <param name="dataRowsIndexes">Indexes of the rows (into <c>_data</c>) belonging to the current node.</param>
        /// <param name="attributesIndexes">Indexes of the attributes to evaluate.</param>
        /// <returns>
        /// A tuple of (best gain ratio, result for the chosen attribute, statistics of the rows). When all rows
        /// share one class the tuple is (<see cref="Double.MaxValue"/>, <c>null</c>, statistics).
        /// </returns>
        private Tuple <double, ComputeAttributeEntropyResult, Statistics> ComputeBestGain(int[] dataRowsIndexes,
                                                                                          IList <int> attributesIndexes)
        {
            var nodeStatistics = ComputeStatistics(dataRowsIndexes);

            // Nothing to split when every row already has the same class.
            if (nodeStatistics.SameClass)
            {
                return new Tuple <double, ComputeAttributeEntropyResult, Statistics>(double.MaxValue, null,
                                                                                     nodeStatistics);
            }

            var bestGainRatio = Double.MinValue;
            ComputeAttributeEntropyResult bestResult = null;

            var gate   = new object();
            var ranges = Partitioner.Create(0, attributesIndexes.Count);

            // Each worker scores a contiguous range of attributes; only the comparison with the running
            // best is serialised. A plain sequential loop over the attributes with the same body is equivalent.
            // NOTE(review): on equal gain ratios the winner depends on which worker reaches the lock first.
            Parallel.ForEach(ranges, range =>
            {
                for (var position = range.Item1; position < range.Item2; position++)
                {
                    var attribute = attributesIndexes[position];

                    var candidate = ComputeAttributeEntropy(dataRowsIndexes, attribute);
                    var gain      = (nodeStatistics.Entropy - candidate.EntropyValue) * candidate.KnownValues /
                                    dataRowsIndexes.Length;
                    var gainRatio = gain / candidate.SplitInformation;

                    lock (gate)
                    {
                        // The first attribute to arrive is the fallback until something beats it.
                        if (bestResult == null)
                        {
                            bestResult = candidate;
                        }

                        if (gainRatio > bestGainRatio)
                        {
                            bestGainRatio = gainRatio;
                            bestResult    = candidate;
                        }
                        else
                        {
                            if (bestResult != candidate)
                            {
                                // A losing candidate no longer needs its subsets.
                                candidate.Subsets = null;
                            }
                            else if (Double.IsNegativeInfinity(gain))
                            {
                                bestResult.InvalidAttributes.Add(attribute);
                            }
                        }
                    }
                }
            });

            return new Tuple <double, ComputeAttributeEntropyResult, Statistics>(bestGainRatio, bestResult,
                                                                                 nodeStatistics);
        }