Ejemplo n.º 1
0
        /// <summary>
        /// Build a new Classification from a random selection of this Classification's points.
        /// Each selected point keeps the label it currently has here.
        /// </summary>
        /// <param name="sampleSize">Number of points requested for the new Classification.</param>
        /// <returns>A new Classification holding the points picked by TakeRandom, each with its current label.</returns>
        public Classification<TPoint, TLabel> Sample(int sampleSize)
        {
            var sampled = new Classification<TPoint, TLabel>();

            // TakeRandom is given the total point count alongside the requested sample size.
            var chosenPoints = Points().TakeRandom(sampleSize, NumPoints);
            foreach (var chosen in chosenPoints)
            {
                var label = GetClassLabel(chosen);
                sampled.Add(chosen, label);
            }

            return sampled;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Produce a copy of this Classification. The copy is a new Classification object,
        /// but it is populated with the very same point and label values (a semi-shallow copy).
        /// </summary>
        /// <returns>The new Classification, typed as object.</returns>
        public object Clone()
        {
            var duplicate = new Classification<TPoint, TLabel>();

            // Re-add every point under the label it currently carries.
            foreach (var member in Points())
            {
                var label = GetClassLabel(member);
                duplicate.Add(member, label);
            }

            return duplicate;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Load the data from a file or standard input unless it has already been loaded
        /// (as reported by IsDataLoaded). Populates InitialClassification, InputOrder and InputDataIds.
        ///
        /// The input must have a header row if Configuration.Data.ReadHeader is true; in that case the
        /// Id and Category columns are located by a case-insensitive match of the column headings against
        /// Configuration.Data.IdField and Configuration.Data.CategoryField. Without a header row, those
        /// two settings are parsed as zero-based column positions instead.
        /// Field values may be separated by commas or tabs. Lines with fewer than two fields are skipped.
        ///
        /// Rows that are too short to contain the Id or Category column fall back to the one-based row
        /// number for the missing value, exactly as if no such column had been configured.
        /// The "Load data" timer is stopped even if reading or parsing throws.
        /// </summary>
        void LoadData()
        {
            if (IsDataLoaded)
            {
                return;
            }
            Timer.Start("Load data");
            try
            {
                InitialClassification = new Classification<UnsignedPoint, string>();
                InputOrder            = new List<UnsignedPoint>();
                InputDataIds          = new Dictionary<UnsignedPoint, string>();

                IEnumerable<string> lines = Configuration.Data.ReadFromStandardIn()
                    ? ReadLinesFromConsole()
                    : File.ReadLines(Configuration.Data.InputDataFile);

                var idPosition       = -1;
                var categoryPosition = -1;

                if (!Configuration.Data.ReadHeader)
                {
                    // No header to search, so the field settings are interpreted as column positions.
                    idPosition       = SafeParseInt(Configuration.Data.IdField, -1);
                    categoryPosition = SafeParseInt(Configuration.Data.CategoryField, -1);
                }

                // Row zero is reserved for the header; the first data row is always row one.
                var rownum     = Configuration.Data.ReadHeader ? 0 : 1;
                var separators = new[] { ',', '\t' };

                foreach (var line in lines)
                {
                    var values = line.Split(separators);

                    // Skip any line that does not split into at least two fields (e.g. blank lines).
                    // Skipped lines do not advance the row number.
                    if (values.Length < 2)
                    {
                        continue;
                    }
                    if (rownum == 0 && Configuration.Data.ReadHeader)
                    {
                        // Identify which columns hold the Id and the Category, if any.
                        // If no column holds the Id, then the one-based row number will be used.
                        // OrdinalIgnoreCase also tolerates an unset (null) field name: nothing matches.
                        var tryIdPosition = Array.FindIndex(
                            values,
                            heading => string.Equals(heading, Configuration.Data.IdField, StringComparison.OrdinalIgnoreCase)
                            );
                        if (tryIdPosition != -1)
                        {
                            idPosition = tryIdPosition;
                        }
                        var tryCategoryPosition = Array.FindIndex(
                            values,
                            heading => string.Equals(heading, Configuration.Data.CategoryField, StringComparison.OrdinalIgnoreCase)
                            );
                        if (tryCategoryPosition != -1)
                        {
                            categoryPosition = tryCategoryPosition;
                        }
                    }
                    else
                    {
                        // Guard against ragged rows and out-of-range configured positions,
                        // which would otherwise raise IndexOutOfRangeException.
                        var hasId       = idPosition >= 0 && idPosition < values.Length;
                        var hasCategory = categoryPosition >= 0 && categoryPosition < values.Length;

                        int    id;
                        string idString;
                        if (hasId)
                        {
                            // If the id is not a number, we use the rownum in the points we create as the id, but
                            // make a correspondence between the string and the point.
                            id       = SafeParseInt(values[idPosition], rownum);
                            idString = values[idPosition];
                        }
                        else
                        {
                            id       = rownum;
                            idString = rownum.ToString();
                        }

                        // Unclassified rows get the row number as category - all points in their own cluster.
                        var categoryString = hasCategory ? values[categoryPosition] : rownum.ToString();

                        // Every field other than the Id and Category columns is a coordinate.
                        var coordinates = new List<uint>();
                        for (var position = 0; position < values.Length; position++)
                        {
                            if (position == idPosition || position == categoryPosition)
                            {
                                continue;
                            }
                            //TODO: Reject negative values and log.
                            // NOTE(review): the second argument to SafeParseInt appears to be the fallback
                            // for unparsable text, so such fields would become 0 - confirm against SafeParseInt.
                            coordinates.Add((uint)SafeParseInt(values[position], 0));
                        }

                        var point = new UnsignedPoint(coordinates, id);
                        //TODO: Check for duplicate ids and log.
                        InputDataIds[point] = idString;
                        InitialClassification.Add(point, categoryString);
                        InputOrder.Add(point);
                    }
                    rownum++;
                }
            }
            finally
            {
                // Always close the timer, even when the input is missing or malformed.
                Timer.Stop("Load data");
            }
        }