/// <summary>
/// Splits the items into a training set and a test set for one fold of a
/// (non-stratified) cross-validation run.
/// </summary>
/// <remarks>
/// The item list is cut into <paramref name="numFolds"/> consecutive slices whose
/// boundaries are the rounded multiples of (item count / numFolds). The slice whose
/// 1-based position equals <paramref name="fold"/> is copied into the test set; every
/// other slice is copied into the training set. Item order is kept.
/// </remarks>
/// <param name="numFolds">Number of folds; checked to be between 2 and the item count.</param>
/// <param name="fold">1-based index of the fold used for testing; checked to be between 1 and numFolds.</param>
/// <param name="trainSet">Receives a new dataset holding the items outside the selected fold.</param>
/// <param name="testSet">Receives a new dataset holding the items of the selected fold.</param>
public void SplitForCrossValidation(int numFolds, int fold, out LabeledDataset<LblT, ExT> trainSet, out LabeledDataset<LblT, ExT> testSet)
{
    // Precondition checks are reported through Utils.ThrowException.
    Utils.ThrowException(mItems.Count < 2 ? new InvalidOperationException() : null);
    Utils.ThrowException((numFolds < 2 || numFolds > mItems.Count) ? new ArgumentOutOfRangeException("numFolds") : null);
    Utils.ThrowException((fold < 1 || fold > numFolds) ? new ArgumentOutOfRangeException("fold") : null);

    trainSet = new LabeledDataset<LblT, ExT>();
    testSet = new LabeledDataset<LblT, ExT>();

    // Fractional width of one fold; slice boundaries are obtained by rounding.
    double foldWidth = (double)mItems.Count / (double)numFolds;
    double lowerBound = 0;
    for (int foldIdx = 0; foldIdx < numFolds; foldIdx++)
    {
        int sliceEnd = (int)Math.Round(lowerBound + foldWidth);
        int sliceStart = (int)Math.Round(lowerBound);

        // fold is 1-based, foldIdx is 0-based.
        LabeledDataset<LblT, ExT> target = (foldIdx == fold - 1) ? testSet : trainSet;
        for (int itemIdx = sliceStart; itemIdx < sliceEnd; itemIdx++)
        {
            target.Add(mItems[itemIdx].Label, mItems[itemIdx].Example);
        }

        lowerBound += foldWidth;
    }
}
/// <summary>
/// Reads a labeled sparse-vector dataset from a text reader, one example per line.
/// </summary>
/// <remarks>
/// Lines that start with '#' are skipped. Every other line must begin with a numeric
/// label followed by whitespace or the end of the line; otherwise an
/// <see cref="IOException"/> is reported through Utils.ThrowException. Each
/// <c>feature:weight</c> pair found on the line sets one component of the example's vector.
/// Numbers are parsed with the invariant culture.
/// NOTE(review): the label pattern admits a fractional part (e.g. "1.0") but the label is
/// parsed as an integer, which presumably rejects such text with a FormatException —
/// confirm whether fractional labels are meant to be accepted.
/// </remarks>
/// <param name="reader">Reader supplying the text; checked for null.</param>
/// <returns>A new dataset with one example per non-comment line, in input order.</returns>
public static LabeledDataset<int, SparseVector<double>> LoadDataset(StreamReader reader)
{
    Utils.ThrowException(reader == null ? new ArgumentNullException("reader") : null);

    // Build both patterns once rather than once per input line.
    Regex labelRegex = new Regex(@"^(?<label>[+-]?\d+(\.\d+)?)(\s|$)");
    Regex featureRegex = new Regex(@"(?<feature>\d+):(?<weight>[-]?[\d\.]+)");

    LabeledDataset<int, SparseVector<double>> dataset = new LabeledDataset<int, SparseVector<double>>();
    string line;
    while ((line = reader.ReadLine()) != null)
    {
        // Ordinal comparison: the comment marker is a plain character, not linguistic text.
        if (line.StartsWith("#", StringComparison.Ordinal))
        {
            continue;
        }

        Match labelMatch = labelRegex.Match(line);
        Utils.ThrowException(!labelMatch.Success ? new IOException() : null);
        // Invariant culture keeps parsing independent of the machine's regional settings,
        // matching how the weights below are parsed.
        int label = Convert.ToInt32(labelMatch.Result("${label}"), CultureInfo.InvariantCulture);

        SparseVector<double> vec = new SparseVector<double>();
        for (Match match = featureRegex.Match(line); match.Success; match = match.NextMatch())
        {
            int feature = Convert.ToInt32(match.Result("${feature}"), CultureInfo.InvariantCulture);
            double weight = Convert.ToDouble(match.Result("${weight}"), CultureInfo.InvariantCulture);
            vec[feature] = weight;
        }

        dataset.Add(new LabeledExample<int, SparseVector<double>>(label, vec));
    }
    return dataset;
}
/// <summary>
/// Builds a labeled dataset in which examples taken from <paramref name="dataset"/>
/// are labeled with the cluster that lists them.
/// </summary>
/// <remarks>
/// The work is delegated to FillClassificationDataset, starting from mRoots.
/// </remarks>
/// <param name="dataset">Collection the cluster item indices refer to; checked for null.</param>
/// <returns>A new dataset filled by FillClassificationDataset.</returns>
public LabeledDataset<Cluster, ExT> GetClassificationDataset<ExT>(IUnlabeledExampleCollection<ExT> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);

    var result = new LabeledDataset<Cluster, ExT>();
    FillClassificationDataset(mRoots, dataset, result); // throws ArgumentValueException
    return result;
}
/// <summary>
/// Opens the named file and reads a labeled sparse-vector dataset from it by delegating
/// to the <see cref="StreamReader"/> overload of LoadDataset.
/// </summary>
/// <param name="fileName">
/// Path of the file to read; checked for null and verified with Utils.VerifyFileNameOpen.
/// </param>
/// <returns>The dataset produced by the reader-based overload.</returns>
public static LabeledDataset<int, SparseVector<double>> LoadDataset(string fileName)
{
    Utils.ThrowException(fileName == null ? new ArgumentNullException("fileName") : null);
    Utils.ThrowException(!Utils.VerifyFileNameOpen(fileName) ? new ArgumentValueException("fileName") : null);

    // The using block releases the file handle even when parsing throws;
    // previously the reader was closed only on the success path.
    using (StreamReader reader = new StreamReader(fileName))
    {
        return LoadDataset(reader);
    }
}
/// <summary>
/// Creates a dataset whose examples are this dataset's examples converted to
/// <paramref name="newExType"/> via ModelUtils.ConvertExample.
/// </summary>
/// <remarks>
/// When <paramref name="move"/> is true, each item slot is set to null right after its
/// example has been converted and the item list is cleared once all items are done, so
/// this dataset ends up empty. The check for a supported target type happens only after
/// the conversion loop (and after the clearing).
/// </remarks>
/// <param name="newExType">Target example type; checked for null.</param>
/// <param name="move">True to empty this dataset while converting.</param>
/// <returns>
/// A LabeledDataset over SparseVector&lt;double&gt;, SparseVector&lt;double&gt;.ReadOnly,
/// BinaryVector or BinaryVector.ReadOnly, depending on <paramref name="newExType"/>.
/// </returns>
/// <exception cref="ArgumentNotSupportedException">
/// <paramref name="newExType"/> is none of the four supported types.
/// </exception>
public ILabeledDataset<LblT> ConvertDataset(Type newExType, bool move)
{
    Utils.ThrowException(newExType == null ? new ArgumentNullException("newExType") : null);

    int itemCount = mItems.Count;
    ArrayList<LabeledExample<LblT, object>> converted = new ArrayList<LabeledExample<LblT, object>>(itemCount);
    for (int idx = 0; idx < mItems.Count; idx++)
    {
        object newExample = ModelUtils.ConvertExample(mItems[idx].Example, newExType); // throws ArgumentValueException
        converted.Add(new LabeledExample<LblT, object>(mItems[idx].Label, newExample));
        if (move)
        {
            // Drop the reference to the source item as soon as it has been converted.
            mItems[idx] = null;
        }
    }
    if (move)
    {
        mItems.Clear();
    }

    // Wrap the converted examples in a dataset of the matching static type.
    if (newExType == typeof(SparseVector<double>))
    {
        return new LabeledDataset<LblT, SparseVector<double>>(converted);
    }
    if (newExType == typeof(SparseVector<double>.ReadOnly))
    {
        return new LabeledDataset<LblT, SparseVector<double>.ReadOnly>(converted);
    }
    if (newExType == typeof(BinaryVector))
    {
        return new LabeledDataset<LblT, BinaryVector>(converted);
    }
    if (newExType == typeof(BinaryVector.ReadOnly))
    {
        return new LabeledDataset<LblT, BinaryVector.ReadOnly>(converted);
    }
    throw new ArgumentNotSupportedException("newExType");
}
/// <summary>
/// Initializes the enumerator with the dataset it refers to.
/// </summary>
/// <param name="dataset">Dataset stored in mDataset; checked for null.</param>
public UnlabeledDatasetEnumerator(LabeledDataset<LblT, ExT> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);

    this.mDataset = dataset;
}
/// <summary>
/// Splits the items into a training set and a test set for one fold of a stratified
/// cross-validation run: every run of equally-labeled items contributes its own share
/// to the test set.
/// </summary>
/// <remarks>
/// The items are scanned for runs of consecutive equal labels. A label that shows up in
/// more than one run is reported as InvalidOperationException("items not sorted"). If the
/// item count is smaller than numFolds times the number of runs, an ArgumentException is
/// reported. Within each run of size s, a fold receives s / numFolds items, and the first
/// s % numFolds folds receive one extra item. All checks go through Utils.ThrowException.
/// </remarks>
/// <param name="numFolds">Number of folds; checked to be between 2 and the item count.</param>
/// <param name="fold">1-based index of the fold used for testing; checked to be between 1 and numFolds.</param>
/// <param name="trainSet">Receives a new dataset holding the items outside the selected fold.</param>
/// <param name="testSet">Receives a new dataset holding the items of the selected fold.</param>
public void SplitForStratifiedCrossValidation(int numFolds, int fold, out LabeledDataset<LblT, ExT> trainSet, out LabeledDataset<LblT, ExT> testSet)
{
    Utils.ThrowException(mItems.Count < 2 ? new InvalidOperationException() : null);
    Utils.ThrowException((numFolds < 2 || numFolds > mItems.Count) ? new ArgumentOutOfRangeException("numFolds") : null);
    Utils.ThrowException((fold < 1 || fold > numFolds) ? new ArgumentOutOfRangeException("fold") : null);

    // Pass 1: collect (label, length) for every run of consecutive equal labels.
    List<Pair<LblT, int>> runs = new List<Pair<LblT, int>>();
    int runStart = 0;
    for (int pos = 1; pos <= mItems.Count; pos++)
    {
        LblT runLabel = mItems[pos - 1].Label;
        if (pos < mItems.Count && runLabel.Equals(mItems[pos].Label))
        {
            // Still inside the current run.
            continue;
        }
        // A run ends at pos; its label must not have been recorded by an earlier run.
        Utils.ThrowException(runs.Any(r => r.First.Equals(runLabel)) ? new InvalidOperationException("items not sorted") : null);
        runs.Add(new Pair<LblT, int>(runLabel, pos - runStart));
        runStart = pos;
    }
    Utils.ThrowException(mItems.Count < numFolds * runs.Count ? new ArgumentException("dataset too small to stratify") : null);

    // Pass 2: for every run, carve out the slice belonging to the requested fold.
    trainSet = new LabeledDataset<LblT, ExT>();
    testSet = new LabeledDataset<LblT, ExT>();
    int foldIdx = fold - 1;
    int runBegin = 0;
    foreach (Pair<LblT, int> run in runs)
    {
        int runSize = run.Second;
        int sliceLen = runSize / numFolds;
        int remainder = runSize % numFolds;
        int offset;
        if (fold <= remainder)
        {
            // This fold and all folds before it are one item longer.
            offset = foldIdx * sliceLen + foldIdx;
            sliceLen++;
        }
        else
        {
            // All the longer folds come before this one.
            offset = foldIdx * sliceLen + remainder;
        }
        int testBegin = runBegin + offset;
        int testStop = testBegin + sliceLen;
        int runStop = runBegin + runSize;

        for (int k = runBegin; k < testBegin; k++)
        {
            trainSet.Add(mItems[k].Label, mItems[k].Example);
        }
        for (int k = testBegin; k < testStop; k++)
        {
            testSet.Add(mItems[k].Label, mItems[k].Example);
        }
        for (int k = testStop; k < runStop; k++)
        {
            trainSet.Add(mItems[k].Label, mItems[k].Example);
        }

        runBegin = runStop;
    }
}
/// <summary>
/// Recursively walks the given clusters and, for every item index listed by a cluster,
/// adds the corresponding example from <paramref name="dataset"/> to
/// <paramref name="classificationDataset"/> labeled with that cluster.
/// </summary>
/// <remarks>
/// A cluster's own items are added before its children are visited. An item index outside
/// [0, dataset.Count) is reported as ArgumentValueException("clusters") through
/// Utils.ThrowException.
/// </remarks>
/// <param name="clusters">Clusters to process at this level.</param>
/// <param name="dataset">Collection the item indices refer to.</param>
/// <param name="classificationDataset">Dataset that receives the (cluster, example) pairs.</param>
private void FillClassificationDataset<ExT>(IEnumerable<Cluster> clusters, IUnlabeledExampleCollection<ExT> dataset, LabeledDataset<Cluster, ExT> classificationDataset)
{
    foreach (Cluster node in clusters)
    {
        foreach (int exampleIdx in node.Items)
        {
            Utils.ThrowException(!(exampleIdx >= 0 && exampleIdx < dataset.Count) ? new ArgumentValueException("clusters") : null);
            classificationDataset.Add(node, dataset[exampleIdx]);
        }

        // Descend into the sub-clusters of this node.
        FillClassificationDataset(node.Children, dataset, classificationDataset);
    }
}