/// <summary>
/// Builds a small three-datum symptom dataset, checks its size statistics before and
/// after applying a feature count threshold of 2, then trains a linear classifier on it
/// and checks the predicted class and the class probabilities for an unlabeled datum.
/// </summary>
public static void TestDataset()
{
    Dataset<string, string> data = new Dataset<string, string>();
    data.Add(new BasicDatum<string, string>(Arrays.AsList(new string[] { "fever", "cough", "congestion" }), "cold"));
    data.Add(new BasicDatum<string, string>(Arrays.AsList(new string[] { "fever", "cough", "nausea" }), "flu"));
    data.Add(new BasicDatum<string, string>(Arrays.AsList(new string[] { "cough", "congestion" }), "cold"));

    // data.summaryStatistics();
    NUnit.Framework.Assert.AreEqual(4, data.NumFeatures());
    NUnit.Framework.Assert.AreEqual(4, data.NumFeatureTypes());
    NUnit.Framework.Assert.AreEqual(2, data.NumClasses());
    NUnit.Framework.Assert.AreEqual(8, data.NumFeatureTokens());
    NUnit.Framework.Assert.AreEqual(3, data.Size());

    // The assertions below expect one feature type (and its single token) to be gone
    // after thresholding at 2.
    data.ApplyFeatureCountThreshold(2);
    NUnit.Framework.Assert.AreEqual(3, data.NumFeatures());
    NUnit.Framework.Assert.AreEqual(3, data.NumFeatureTypes());
    NUnit.Framework.Assert.AreEqual(2, data.NumClasses());
    NUnit.Framework.Assert.AreEqual(7, data.NumFeatureTokens());
    NUnit.Framework.Assert.AreEqual(3, data.Size());

    //Dataset data = Dataset.readSVMLightFormat(args[0]);
    //double[] scores = data.getInformationGains();
    //System.out.println(ArrayMath.mean(scores));
    //System.out.println(ArrayMath.variance(scores));
    LinearClassifierFactory<string, string> factory = new LinearClassifierFactory<string, string>();
    LinearClassifier<string, string> classifier = factory.TrainClassifier(data);
    IDatum<string, string> d = new BasicDatum<string, string>(Arrays.AsList(new string[] { "cough", "fever" }));

    // NUnit's argument order is (expected, actual[, delta], message). The message goes
    // last; putting it first (JUnit order) would compare the message text against the
    // expected value instead of checking the classifier output.
    NUnit.Framework.Assert.AreEqual("flu", classifier.ClassOf(d), "Classification incorrect");
    ICounter<string> probs = classifier.ProbabilityOf(d);
    NUnit.Framework.Assert.AreEqual(0.4553, probs.GetCount("cold"), 0.0001, "Returned probability incorrect");
    NUnit.Framework.Assert.AreEqual(0.5447, probs.GetCount("flu"), 0.0001, "Returned probability incorrect");
    System.Console.Out.WriteLine();
}
/// <summary>
/// Parses one line of SVMLight-style text into a datum.
/// </summary>
/// <remarks>
/// Anything from the first <c>#</c> onward is discarded as a comment. The remaining text
/// is split on runs of whitespace: the first token becomes the label and every following
/// token is expected to have the form <c>feature:value</c>. The feature name is added to
/// the datum <c>(int)value</c> times. A token that does not split into exactly two parts
/// is reported through the logger; if it has no value part at all it is skipped, otherwise
/// its first two parts are used. Finally a constant feature named after
/// <c>int.MaxValue</c> is appended to every datum.
/// Each call increments the <c>line1</c> counter, which is only used in the log message.
/// </remarks>
/// <param name="l">The raw input line.</param>
/// <returns>A datum holding the repeated feature names and the label from the line.</returns>
/// <exception cref="System.FormatException">A value part is not a valid number.</exception>
public static IDatum<string, string> SvmLightLineToDatum(string l)
{
    line1++;

    // Remove any trailing comment. Trailing whitespace is trimmed as well so that the
    // regex split below does not yield an empty final token.
    l = System.Text.RegularExpressions.Regex.Replace(l, "#.*", string.Empty).TrimEnd();

    // The separator is a regular expression, so it must go through Regex.Split;
    // string.Split would treat "\\s+" as literal text.
    string[] line = System.Text.RegularExpressions.Regex.Split(l, "\\s+");

    ICollection<string> features = new List<string>();
    for (int i = 1; i < line.Length; i++)
    {
        string[] f = line[i].Split(':');
        if (f.Length != 2)
        {
            logger.Info("Dataset error: line " + line1);
            if (f.Length < 2)
            {
                // There is no value part; reading f[1] would throw, so skip this token.
                continue;
            }
        }

        // Values are written with '.' as the decimal separator regardless of the
        // machine's culture, so parse with the invariant culture.
        int val = (int)double.Parse(
            f[1],
            System.Globalization.NumberStyles.Float,
            System.Globalization.CultureInfo.InvariantCulture);
        for (int j = 0; j < val; j++)
        {
            features.Add(f[0]);
        }
    }

    features.Add(int.MaxValue.ToString()); // a constant feature for a class
    IDatum<string, string> d = new BasicDatum<string, string>(features, line[0]);
    return d;
}
/// <summary>
/// Counts, for every feature, how many datums in this dataset it appears in.
/// </summary>
/// <returns>
/// A counter keyed by feature. Each datum's features are first collapsed into a set, so
/// a feature adds 1.0 per datum no matter how often it occurs inside that datum.
/// </returns>
public virtual ICounter<F> GetFeatureCounter()
{
    ICounter<F> counts = new ClassicCounter<F>();
    int index = 0;
    while (index < this.Size())
    {
        BasicDatum<L, F> current = (BasicDatum<L, F>)GetDatum(index);

        // Collapse repeated features so each one is counted once for this datum.
        ICollection<F> distinctFeatures = Generics.NewHashSet(current.AsFeatures());
        foreach (F feature in distinctFeatures)
        {
            counts.IncrementCount(feature, 1.0);
        }

        index++;
    }

    return counts;
}
/// <summary>
/// Makes sure <c>logProbs</c> holds the per-tag log scores for <paramref name="word"/>,
/// recomputing them only when the word differs from the one handled by the previous call.
/// </summary>
/// <remarks>
/// If the word has an entry in <c>functionWordTags</c>, every tag whose basic category
/// equals that entry gets a score of 0 and every other tag gets negative infinity.
/// Otherwise the scores come from <c>scorer</c> applied to the features extracted for
/// the word.
/// NOTE(review): the cache is keyed on the word only. A call with the same word but a
/// different <paramref name="subtractTagScore"/> reuses the earlier result. Confirm that
/// callers never mix the two for one word.
/// </remarks>
/// <param name="word">Index of the word in <c>wordIndex</c>.</param>
/// <param name="subtractTagScore">
/// When true, the log of the tag's probability under <c>tagDist</c> is subtracted from
/// each scorer-produced score. It has no effect on the function-word branch.
/// </param>
private void EnsureProbs(int word, bool subtractTagScore)
{
    if (word == lastWord)
    {
        return;
    }
    lastWord = word;

    // Resolve the word once instead of repeating the index lookup at every use.
    var wordString = wordIndex.Get(word);

    if (functionWordTags.Contains(wordString))
    {
        logProbs = new ClassicCounter<string>();
        string trueTag = functionWordTags[wordString];
        foreach (string tag in tagIndex.ObjectsList())
        {
            bool matchesTrueTag = ctlp.BasicCategory(tag).Equals(trueTag);
            logProbs.SetCount(tag, matchesTrueTag ? 0.0 : double.NegativeInfinity);
        }
        return;
    }

    IDatum datum = new BasicDatum(featExtractor.MakeFeatures(wordString));
    logProbs = scorer.LogProbabilityOf(datum);
    if (subtractTagScore)
    {
        // Iterate over a snapshot of the keys: the loop body updates the same counter,
        // and a live key view may not tolerate modification during enumeration.
        List<string> tagSnapshot = new List<string>(logProbs.KeySet());
        foreach (string tag in tagSnapshot)
        {
            logProbs.IncrementCount(tag, -Math.Log(tagDist.ProbabilityOf(tag)));
        }
    }
}