예제 #1
0
        /// <summary>
        /// Splits the items into a training set and a test set for one fold of
        /// a cross-validation run. Items are taken in their current order; the
        /// fold boundaries are obtained by rounding multiples of
        /// (item count / <paramref name="numFolds"/>).
        /// </summary>
        /// <param name="numFolds">Number of folds; must be between 2 and the number of items.</param>
        /// <param name="fold">1-based index of the fold that becomes the test set.</param>
        /// <param name="trainSet">Receives the items of all folds other than <paramref name="fold"/>.</param>
        /// <param name="testSet">Receives the items of fold <paramref name="fold"/>.</param>
        public void SplitForCrossValidation(int numFolds, int fold, out LabeledDataset <LblT, ExT> trainSet, out LabeledDataset <LblT, ExT> testSet)
        {
            Utils.ThrowException(mItems.Count < 2 ? new InvalidOperationException() : null);
            Utils.ThrowException((numFolds < 2 || numFolds > mItems.Count) ? new ArgumentOutOfRangeException("numFolds") : null);
            Utils.ThrowException((fold < 1 || fold > numFolds) ? new ArgumentOutOfRangeException("fold") : null);
            trainSet = new LabeledDataset<LblT, ExT>();
            testSet = new LabeledDataset<LblT, ExT>();

            double foldSize = (double)mItems.Count / (double)numFolds;
            double offset = 0;
            int testFoldIdx = fold - 1; // the fold argument is 1-based

            for (int foldIdx = 0; foldIdx < numFolds; foldIdx++)
            {
                double nextOffset = offset + foldSize;
                int first = (int)Math.Round(offset);
                int stop = (int)Math.Round(nextOffset);

                // The selected fold feeds the test set, every other fold the training set.
                LabeledDataset<LblT, ExT> target = foldIdx == testFoldIdx ? testSet : trainSet;
                for (int j = first; j < stop; j++)
                {
                    target.Add(mItems[j].Label, mItems[j].Example);
                }
                offset = nextOffset;
            }
        }
예제 #2
0
        /// <summary>
        /// Reads a labeled sparse-vector dataset from a text stream, one example per line.
        /// Each line starts with a numeric label followed by any number of
        /// <c>feature:weight</c> pairs. Lines starting with '#' are skipped.
        /// </summary>
        /// <param name="reader">Source of the text lines; must not be null.</param>
        /// <returns>A dataset holding one example for every non-comment line.</returns>
        /// <remarks>
        /// A line that does not start with a label is reported as an <see cref="IOException"/>
        /// through Utils.ThrowException. The label pattern also accepts a fractional part,
        /// but the label is converted with Convert.ToInt32, which rejects such text with a
        /// FormatException.
        /// </remarks>
        public static LabeledDataset <int, SparseVector <double> > LoadDataset(StreamReader reader)
        {
            Utils.ThrowException(reader == null ? new ArgumentNullException("reader") : null);

            // Build the patterns once instead of once per input line.
            Regex labelRegex = new Regex(@"^(?<label>[+-]?\d+(\.\d+)?)(\s|$)");
            Regex featureRegex = new Regex(@"(?<feature>\d+):(?<weight>[-]?[\d\.]+)");

            LabeledDataset<int, SparseVector<double>> dataset = new LabeledDataset<int, SparseVector<double>>();
            string line;

            while ((line = reader.ReadLine()) != null)
            {
                // Ordinal comparison: the comment marker is a plain character, not linguistic text.
                if (line.StartsWith("#", StringComparison.Ordinal))
                {
                    continue;
                }

                Match labelMatch = labelRegex.Match(line);
                Utils.ThrowException(!labelMatch.Success ? new IOException() : null);

                // Parse all numbers with the invariant culture so the result does not
                // depend on the sign and digit conventions of the current thread culture.
                int label = Convert.ToInt32(labelMatch.Groups["label"].Value, CultureInfo.InvariantCulture);

                SparseVector<double> vec = new SparseVector<double>();
                for (Match match = featureRegex.Match(line); match.Success; match = match.NextMatch())
                {
                    int feature = Convert.ToInt32(match.Groups["feature"].Value, CultureInfo.InvariantCulture);
                    double weight = Convert.ToDouble(match.Groups["weight"].Value, CultureInfo.InvariantCulture);
                    vec[feature] = weight;
                }
                dataset.Add(new LabeledExample<int, SparseVector<double>>(label, vec));
            }
            return dataset;
        }
예제 #3
0
        /// <summary>
        /// Builds a labeled dataset in which every example of <paramref name="dataset"/>
        /// referenced by a cluster is labeled with that cluster. The work is delegated to
        /// FillClassificationDataset, starting from the root clusters.
        /// </summary>
        /// <typeparam name="ExT">Type of the examples.</typeparam>
        /// <param name="dataset">Collection the cluster item indices refer to; must not be null.</param>
        /// <returns>The newly created dataset.</returns>
        public LabeledDataset <Cluster, ExT> GetClassificationDataset <ExT>(IUnlabeledExampleCollection <ExT> dataset)
        {
            ArgumentNullException nullError = dataset == null ? new ArgumentNullException("dataset") : null;
            Utils.ThrowException(nullError);

            var result = new LabeledDataset<Cluster, ExT>();
            FillClassificationDataset(mRoots, dataset, result); // throws ArgumentValueException
            return result;
        }
예제 #4
0
        /// <summary>
        /// Opens the given file and loads a labeled sparse-vector dataset from it
        /// by delegating to the StreamReader overload of LoadDataset.
        /// </summary>
        /// <param name="fileName">Path of the file to read; must not be null and must pass Utils.VerifyFileNameOpen.</param>
        /// <returns>The dataset read from the file.</returns>
        public static LabeledDataset <int, SparseVector <double> > LoadDataset(string fileName)
        {
            Utils.ThrowException(fileName == null ? new ArgumentNullException("fileName") : null);
            Utils.ThrowException(!Utils.VerifyFileNameOpen(fileName) ? new ArgumentValueException("fileName") : null);

            // The using block releases the file handle even when parsing throws.
            using (StreamReader reader = new StreamReader(fileName))
            {
                return LoadDataset(reader);
            }
        }
예제 #5
0
        /// <summary>
        /// Creates a dataset whose examples are this dataset's examples converted to
        /// <paramref name="newExType"/> via ModelUtils.ConvertExample; labels are carried over.
        /// </summary>
        /// <param name="newExType">
        /// Target example type; must not be null. A result is produced for
        /// SparseVector&lt;double&gt;, SparseVector&lt;double&gt;.ReadOnly, BinaryVector and
        /// BinaryVector.ReadOnly; any other type ends in ArgumentNotSupportedException.
        /// </param>
        /// <param name="move">
        /// When true, each source slot is set to null right after it has been converted
        /// and the item list is cleared once all items are converted.
        /// </param>
        /// <returns>The converted dataset.</returns>
        public ILabeledDataset <LblT> ConvertDataset(Type newExType, bool move)
        {
            Utils.ThrowException(newExType == null ? new ArgumentNullException("newExType") : null);
            var converted = new ArrayList<LabeledExample<LblT, object>>(mItems.Count);

            for (int idx = 0; idx < mItems.Count; idx++)
            {
                converted.Add(new LabeledExample<LblT, object>(
                    mItems[idx].Label,
                    ModelUtils.ConvertExample(mItems[idx].Example, newExType))); // throws ArgumentValueException
                if (move)
                {
                    // Drop the reference to the source item as soon as it has been converted.
                    mItems[idx] = null;
                }
            }
            if (move)
            {
                mItems.Clear();
            }

            // NOTE(review): the supported-type check below runs after the items have
            // already been consumed when move is true - confirm this ordering is intended.
            if (newExType == typeof(SparseVector<double>))
            {
                return new LabeledDataset<LblT, SparseVector<double>>(converted);
            }
            if (newExType == typeof(SparseVector<double>.ReadOnly))
            {
                return new LabeledDataset<LblT, SparseVector<double>.ReadOnly>(converted);
            }
            if (newExType == typeof(BinaryVector))
            {
                return new LabeledDataset<LblT, BinaryVector>(converted);
            }
            if (newExType == typeof(BinaryVector.ReadOnly))
            {
                return new LabeledDataset<LblT, BinaryVector.ReadOnly>(converted);
            }
            throw new ArgumentNotSupportedException("newExType");
        }
예제 #6
0
 /// <summary>
 /// Creates an enumerator over the given dataset and stores the dataset reference.
 /// </summary>
 /// <param name="dataset">Dataset to enumerate; must not be null.</param>
 public UnlabeledDatasetEnumerator(LabeledDataset <LblT, ExT> dataset)
 {
     ArgumentNullException nullError = dataset == null ? new ArgumentNullException("dataset") : null;
     Utils.ThrowException(nullError);
     mDataset = dataset;
 }
예제 #7
0
        /// <summary>
        /// Splits the items into a training set and a test set for one fold of a
        /// stratified cross-validation run. The items must be grouped by label
        /// (all items with equal labels adjacent); each such run of equal labels is
        /// divided into <paramref name="numFolds"/> slices and slice number
        /// <paramref name="fold"/> of every run goes to the test set.
        /// </summary>
        /// <param name="numFolds">Number of folds; must be between 2 and the number of items.</param>
        /// <param name="fold">1-based index of the fold that becomes the test set.</param>
        /// <param name="trainSet">Receives the items outside the selected slice of each run.</param>
        /// <param name="testSet">Receives the selected slice of each run.</param>
        public void SplitForStratifiedCrossValidation(int numFolds, int fold, out LabeledDataset <LblT, ExT> trainSet, out LabeledDataset <LblT, ExT> testSet)
        {
            Utils.ThrowException(mItems.Count < 2 ? new InvalidOperationException() : null);
            Utils.ThrowException((numFolds < 2 || numFolds > mItems.Count) ? new ArgumentOutOfRangeException("numFolds") : null);
            Utils.ThrowException((fold < 1 || fold > numFolds) ? new ArgumentOutOfRangeException("fold") : null);

            // Pass 1: collect (label, length) for every run of consecutive equal labels.

            var runs = new List<Pair<LblT, int>>();
            LblT runLabel = default(LblT);
            int runStart = 0;
            int pos = 0;

            while (true)
            {
                bool atEnd = pos == mItems.Count;
                bool runClosed = pos > 0 && (atEnd || !runLabel.Equals(mItems[pos].Label));
                if (runClosed)
                {
                    // A label that already closed an earlier run means equal labels are not adjacent.
                    bool seenBefore = runs.Any(r => r.First.Equals(runLabel));
                    Utils.ThrowException(seenBefore ? new InvalidOperationException("items not sorted") : null);
                    runs.Add(new Pair<LblT, int>(runLabel, pos - runStart));
                    runStart = pos;
                }
                if (atEnd)
                {
                    break;
                }
                runLabel = mItems[pos].Label;
                pos++;
            }
            Utils.ThrowException(mItems.Count < numFolds * runs.Count ? new ArgumentException("dataset too small to stratify") : null);

            // Pass 2: distribute every run between the two sets.

            trainSet = new LabeledDataset<LblT, ExT>();
            testSet = new LabeledDataset<LblT, ExT>();
            int segStart = 0;

            foreach (Pair<LblT, int> run in runs)
            {
                int size = run.Second;
                int baseLen = size / numFolds;
                int remainder = size % numFolds;

                // The first `remainder` folds are one item longer than the others.
                int testLen = baseLen;
                int testStart = segStart + (fold - 1) * baseLen;
                if (fold <= remainder)
                {
                    testLen++;
                    testStart += fold - 1;
                }
                else
                {
                    testStart += remainder;
                }
                int testEnd = testStart + testLen;
                int segEnd = segStart + size;

                for (int i = segStart; i < segEnd; i++)
                {
                    bool inTestSlice = i >= testStart && i < testEnd;
                    LabeledDataset<LblT, ExT> target = inTestSlice ? testSet : trainSet;
                    target.Add(mItems[i].Label, mItems[i].Example);
                }
                segStart = segEnd;
            }
        }
예제 #8
0
 /// <summary>
 /// Walks the given clusters and, recursively, their children. For every item index
 /// of a cluster the example at that index in <paramref name="dataset"/> is added to
 /// <paramref name="classificationDataset"/> with the cluster as its label.
 /// </summary>
 /// <typeparam name="ExT">Type of the examples.</typeparam>
 /// <param name="clusters">Clusters to process at this level.</param>
 /// <param name="dataset">Collection the cluster item indices refer to.</param>
 /// <param name="classificationDataset">Dataset that receives the labeled examples.</param>
 private void FillClassificationDataset <ExT>(IEnumerable <Cluster> clusters, IUnlabeledExampleCollection <ExT> dataset, LabeledDataset <Cluster, ExT> classificationDataset)
 {
     foreach (Cluster node in clusters)
     {
         foreach (int exampleIdx in node.Items)
         {
             // An index outside the dataset is reported as an invalid "clusters" argument.
             bool inRange = exampleIdx >= 0 && exampleIdx < dataset.Count;
             Utils.ThrowException(inRange ? null : new ArgumentValueException("clusters"));
             classificationDataset.Add(node, dataset[exampleIdx]);
         }
         // Descend only after the cluster's own items have been added.
         FillClassificationDataset(node.Children, dataset, classificationDataset);
     }
 }