/// <summary>
/// Converts the test tweets of one fold into bag-of-words vectors.
/// </summary>
/// <param name="foldN">Key of the fold whose entry in <c>mFoldBowSpaces</c> is used.</param>
/// <param name="testSet">Labeled tweets to convert.</param>
/// <returns>
/// A new labeled dataset in which every example keeps its label and carries the
/// vector produced by <c>ProcessDocument</c> for the tweet's <c>Text</c>.
/// </returns>
protected override ILabeledDataset<SentimentLabel, SparseVector<double>> MapTestSet(int foldN, ILabeledDataset<SentimentLabel, Tweet> testSet)
{
    // The fold's BoW space is looked up per example, inside the projection,
    // so nothing is read from mFoldBowSpaces unless there is an example to map.
    var vectorized = testSet.Select(
        le => new LabeledExample<SentimentLabel, SparseVector<double>>(
            le.Label,
            mFoldBowSpaces[foldN].ProcessDocument(le.Example.Text)));

    return new LabeledDataset<SentimentLabel, SparseVector<double>>(vectorized);
}
/// <summary>
/// Creates the bag-of-words space for a fold, initializes it on the fold's
/// training texts and returns the resulting vectors paired with their labels.
/// </summary>
/// <param name="foldN">Fold key; it must not yet have an entry in <c>mFoldBowSpaces</c>.</param>
/// <param name="trainSet">Labeled training texts of the fold.</param>
/// <returns>
/// A new labeled dataset whose i-th entry combines the label of
/// <c>trainSet[i]</c> with the i-th vector returned by the BoW space's <c>Initialize</c>.
/// </returns>
protected override ILabeledDataset<LblT, SparseVector<double>> MapTrainSet(int foldN, ILabeledDataset<LblT, string> trainSet)
{
    // A fold may be mapped only once: there must be no space registered yet ...
    BowSpace registered;
    Preconditions.CheckState(!mFoldBowSpaces.TryGetValue(foldN, out registered));

    // ... and registering the freshly created one has to succeed.
    BowSpace bowSpace = BowSpaceFunc();
    Preconditions.CheckState(mFoldBowSpaces.TryAdd(foldN, bowSpace));

    // A delta BoW space is initialized from the labeled dataset itself;
    // any other space only receives the raw texts.
    List<SparseVector<double>> vectors;
    var deltaBowSpace = bowSpace as DeltaBowSpace<LblT>;
    if (deltaBowSpace != null)
    {
        vectors = deltaBowSpace.Initialize(trainSet);
    }
    else
    {
        vectors = bowSpace.Initialize(trainSet.Select(d => d.Example));
    }

    // Pair each vector with the label found at the same position in the training set.
    var result = new LabeledDataset<LblT, SparseVector<double>>();
    for (int pos = 0; pos < vectors.Count; pos++)
    {
        result.Add(trainSet[pos].Label, vectors[pos]);
    }
    return result;
}
/// <summary>
/// Initializes the bag-of-words space from labeled texts and re-weights every
/// resulting vector with per-word deltas derived from how the word's document
/// counts are distributed over the labels.
/// </summary>
/// <param name="labeledDataset">
/// Labeled texts. The texts are passed to the base <c>Initialize</c>; the label at
/// position i is attributed to the i-th vector that call returns.
/// </param>
/// <param name="largeScale">Forwarded unchanged to the base <c>Initialize</c>.</param>
/// <returns>
/// A new list containing the vector instances returned by the base
/// <c>Initialize</c>, each after <c>CalcDeltaBow</c> has been invoked on it.
/// </returns>
/// <remarks>
/// <c>NormalizeVectors</c> is switched off for the duration of the base call and is
/// always restored afterwards, even if that call throws. The original setting is
/// handed to <c>CalcDeltaBow</c> instead (presumably so that normalization happens
/// after delta weighting - confirm against <c>CalcDeltaBow</c>).
/// </remarks>
public List<SparseVector<double>> Initialize(ILabeledDataset<LabelT, string> labeledDataset, bool largeScale)
{
    bool normalizeVectors = NormalizeVectors;
    List<SparseVector<double>> bowData;
    NormalizeVectors = false;
    try
    {
        bowData = base.Initialize(labeledDataset.Select(d => d.Example), largeScale);
    }
    finally
    {
        // Restore the caller's setting even when initialization fails; otherwise
        // the instance would silently stay in non-normalizing mode.
        NormalizeVectors = normalizeVectors;
    }

    // count word label frequencies
    var labelWordCounts = new Dictionary<LabelT, Dictionary<int, int>>();
    for (int i = 0; i < bowData.Count; i++)
    {
        // Resolved lazily on the first term of the document: a label whose documents
        // are all empty must not get an entry, because the number of entries is
        // used as the label count in the delta formula below.
        Dictionary<int, int> wordCounts = null;
        foreach (IdxDat<double> idxDat in bowData[i])
        {
            if (wordCounts == null)
            {
                LabelT label = labeledDataset[i].Label;
                if (!labelWordCounts.TryGetValue(label, out wordCounts))
                {
                    labelWordCounts.Add(label, wordCounts = new Dictionary<int, int>());
                }
            }
            int count;
            wordCounts.TryGetValue(idxDat.Idx, out count); // leaves count at 0 when the word is new
            wordCounts[idxDat.Idx] = count + 1;
        }
    }

    // calc deltas
    int labelCount = labelWordCounts.Count;
    foreach (Word word in Words)
    {
        bool seen = false;
        double max = 0;
        double sum = 0;
        foreach (Dictionary<int, int> wordCounts in labelWordCounts.Values)
        {
            int count;
            if (wordCounts.TryGetValue(word.mIdx, out count))
            {
                seen = true;
                sum += count;
                if (count > max)
                {
                    max = count;
                }
            }
        }

        // Neutral weight for words that occur under no label, and for the degenerate
        // single-label case, where (labelCount - 1) == 0 would turn the logarithm
        // into -Infinity and the weight into +Infinity.
        double delta = 1;
        if (seen && labelCount > 1)
        {
            // |log2| of the largest per-label count relative to the average count
            // over the remaining labels (denominator clamped to at least 1).
            delta = Math.Abs(Math.Log(max / Math.Max(sum - max, 1) * (labelCount - 1), 2));
        }
        mWordDeltas.Add(word.mIdx, delta);
    }

    // transform vectors using deltas
    var bowDataset = new List<SparseVector<double>>(bowData.Count);
    foreach (SparseVector<double> bow in bowData)
    {
        CalcDeltaBow(bow, normalizeVectors);
        bowDataset.Add(bow);
    }
    return bowDataset;
}