// @Assumption: labels are stored as a natural range; that is, their values lie in the half-open interval [0, #classes).
        /// <summary>
        /// Builds the absolute (count-based) confusion matrix for a set of single-label predictions.
        /// </summary>
        /// <param name="actualLabels">Ground-truth labels; every element is cast to <c>SingleLabel</c>.</param>
        /// <param name="predictedLabels">
        /// Predicted labels, index-aligned with <paramref name="actualLabels"/>; every element is cast to <c>SingleLabel</c>.
        /// </param>
        /// <param name="classCount">
        /// Number of classes. The resulting matrix has <paramref name="classCount"/> rows and columns.
        /// </param>
        /// <returns>
        /// A matrix in which cell (row, column) counts the instances whose actual label is <c>row</c>
        /// and whose predicted label is <c>column</c>.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// The two label arrays differ in length, or a label value lies outside [0, <paramref name="classCount"/>).
        /// </exception>
        /// <remarks>
        /// Predicted labels are stored in the columns.
        /// Actual labels are stored in the rows.
        /// </remarks>
        public static Matrix <int> Create(Array <ILabel> actualLabels, Array <ILabel> predictedLabels, int classCount)
        {
            if (actualLabels.Length != predictedLabels.Length)
            {
                throw new InvalidOperationException(
                    $"actualLabels.Length ({actualLabels.Length}) must be equal to predictedLabels.Length ({predictedLabels.Length}).");
            }

            var absoluteConfusionMatrix = new MutableMatrix <int>(
                rowCount: classCount,
                columnCount: classCount);

            var instanceCount = actualLabels.Length;

            for (int instanceIndex = 0; instanceIndex < instanceCount; instanceIndex++)
            {
                var actual    = ((SingleLabel)actualLabels[instanceIndex]).Value;
                var predicted = ((SingleLabel)predictedLabels[instanceIndex]).Value;

                // Label values double as matrix indices, so they must form the natural range
                // [0, classCount). Fail with a clear message rather than indexing outside the matrix.
                if (actual < 0 || actual >= classCount || predicted < 0 || predicted >= classCount)
                {
                    throw new InvalidOperationException(
                        $"Label out of range at instance {instanceIndex}: actual = {actual}, predicted = {predicted}, classCount = {classCount}.");
                }

                var oldConfusionValue = absoluteConfusionMatrix.Get(
                    rowIndex: actual,
                    columnIndex: predicted);

                absoluteConfusionMatrix.Set(
                    rowIndex: actual,
                    columnIndex: predicted,
                    value: oldConfusionValue + 1);
            }

            return absoluteConfusionMatrix.ToMatrix();
        }
예제 #2
0
        /// <summary>
        /// Reads a CSV file of numeric values into a mutable matrix, one record per row and one field per column.
        /// </summary>
        /// <param name="filename">Path of the CSV file to read.</param>
        /// <returns>
        /// A matrix with as many rows as <c>GetNumberOfRecords</c> reports and as many columns as
        /// <c>GetAndValidateNumberOfFields</c> reports for <paramref name="filename"/>.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// The file ends before the expected number of records was read, or a field cannot be parsed as a float.
        /// </exception>
        private static MutableMatrix <float> ReadDataFile(string filename)
        {
            var recordCount = GetNumberOfRecords(filename);
            var fieldCount  = GetAndValidateNumberOfFields(filename);

            var matrix = new MutableMatrix <float>(rowCount: recordCount, columnCount: fieldCount);

            using var streamReader = new StreamReader(path: filename);
            using var csvReader    = new CsvReader(reader: streamReader, CultureInfo.InvariantCulture);

            for (int i = 0; i < recordCount; i++)
            {
                // Read() reports whether another record was available; do not ignore a premature end of file.
                if (!csvReader.Read())
                {
                    throw new InvalidOperationException($"Unexpected end of file. Expected {recordCount} records but could only read {i}. File: {filename}.");
                }

                for (int j = 0; j < fieldCount; j++)
                {
                    var fieldValue = csvReader.GetField(j);

                    // Parse with the invariant culture, matching the CsvReader configuration, so the result
                    // does not depend on the machine's regional settings (e.g. ',' vs '.' as decimal separator).
                    // The number styles are the defaults float.TryParse(string, out float) would use.
                    var parsed = float.TryParse(
                        fieldValue,
                        NumberStyles.Float | NumberStyles.AllowThousands,
                        CultureInfo.InvariantCulture,
                        out var parsedValue);
                    if (!parsed)
                    {
                        throw new InvalidOperationException($"Parsing error. Can't parse {fieldValue} as float. Line {i}. File: {filename}.");
                    }

                    matrix.Set(rowIndex: i, columnIndex: j, value: parsedValue);
                }
            }

            return matrix;
        }
예제 #3
0
        /// <summary>
        /// Computes the pairwise distance matrix of the rows of <paramref name="datasetInstaces"/>,
        /// using <c>Euclidean</c> on every ordered pair of rows.
        /// </summary>
        /// <param name="datasetInstaces">Matrix whose rows are the instances to compare.</param>
        /// <returns>
        /// A square matrix with one row and one column per input row; cell (i, j) holds the value
        /// <c>Euclidean</c> returns for input rows i and j.
        /// </returns>
        public static Matrix <double> ComputeEuclideanDistanceMatrix(Matrix <float> datasetInstaces)
        {
            var size   = datasetInstaces.RowCount;
            var result = new MutableMatrix <double>(size, size);

            // Each parallel iteration fills exactly one row of the result.
            Parallel.For(fromInclusive: 0, toExclusive: size, body: rowIndex =>
            {
                var source = datasetInstaces.GetRow(rowIndex);

                for (int columnIndex = 0; columnIndex < size; columnIndex++)
                {
                    var target = datasetInstaces.GetRow(columnIndex);
                    result.Set(rowIndex, columnIndex, Euclidean(source, target));
                }
            });

            return result.ToMatrix();
        }
예제 #4
0
        public static Dataset CreateFromMutableObjects(
            FeatureType[] mutableFeatureTypes,
            MutableMatrix <float> mutableData,
            Array <ILabel> labels,
            bool isTrainDataset,
            ClassificationType classificationType
            )
        {
            if (mutableFeatureTypes.Length != mutableData.ColumnCount)
            {
                throw new ArgumentException("featureTypes.Length must be equal to  data.ColumnCount");
            }
            if (mutableData.RowCount != labels.Length)
            {
                throw new ArgumentException("label.RowCount must be equal to data.RowCount");
            }

            var featureTypes   = mutableFeatureTypes.ToArray();
            var data           = mutableData.ToMatrix();
            var dataTransposed = mutableData.Transpose().ToMatrix();

            var featuresCount = featureTypes.Length;

            var sortedFeatureValues       = new float[featuresCount][];
            var sortedUniqueFeatureValues = new float[featuresCount][];
            var featureValueFrequencies   = new Dictionary <float, int> [featuresCount];
            var dimensionIntervals        = new IInterval[featuresCount];

            var distanceMatrixTask = Task.Run(() => Distance.ComputeEuclideanDistanceMatrix(data));

            Parallel.For(fromInclusive: 0, toExclusive: featuresCount, body: featureIndex => {
                var currentFeatureValues = dataTransposed.GetRow(featureIndex).ToArray();

                ThrowIfDatasetContainsNonFiniteValues(currentFeatureValues);

                var sufv = currentFeatureValues
                           .Distinct()
                           .OrderBy(v => v)
                           .ToArray();

                sortedUniqueFeatureValues[featureIndex] = sufv;

                ThrowIfTrainDatasetContainsFeaturesWithSingleValue(isTrainDataset, featureIndex, sufv);

                sortedFeatureValues[featureIndex] = currentFeatureValues
                                                    .OrderBy(v => v)
                                                    .ToArray();

                var counts = currentFeatureValues
                             .GroupBy(v => v)
                             .ToDictionary(
                    keySelector: g => g.Key,
                    elementSelector: g => g.Count());

                featureValueFrequencies[featureIndex] = counts;
            });

            int classCount = classificationType switch
            {
                ClassificationType.SingleLabel => labels.Distinct().Count(),
                ClassificationType.MultiLabel => ((MultiLabel)labels[0]).Values.Length,
                _ => throw CommonExceptions.UnknownClassificationType,
            };

            var classFrequencies = classificationType switch
            {
                ClassificationType.SingleLabel => ComputeSingleLabelClassFrequencies(labels, classCount),
                ClassificationType.MultiLabel => ComputeMultiLabelClassFrequencies(labels, classCount),
                _ => throw CommonExceptions.UnknownClassificationType,
            };

            var defaultLabel = ComputeDefaultLabel(labels, classificationType);

            Task.WaitAll(distanceMatrixTask);
            var distanceMatrix = distanceMatrixTask.Result;

            return(new Dataset(
                       instanceCount: data.RowCount,
                       featureCount: data.ColumnCount,
                       classCount: classCount,
                       classificationType: classificationType,
                       instanceLabels: labels,
                       featureTypes: featureTypes,
                       data: data,
                       distanceMatrix: distanceMatrix,
                       classFrequencies: classFrequencies,
                       defaultLabel: defaultLabel,
                       sortedUniqueFeatureValues: sortedUniqueFeatureValues));