/// <summary>
        /// Builds a three-datum symptom dataset, checks its size statistics before and after
        /// applying a feature count threshold of 2, then trains a linear classifier on it and
        /// checks the predicted class and class probabilities for a "cough" + "fever" datum.
        /// </summary>
        public static void TestDataset()
        {
            Dataset <string, string> data = new Dataset <string, string>();

            data.Add(new BasicDatum <string, string>(Arrays.AsList(new string[] { "fever", "cough", "congestion" }), "cold"));
            data.Add(new BasicDatum <string, string>(Arrays.AsList(new string[] { "fever", "cough", "nausea" }), "flu"));
            data.Add(new BasicDatum <string, string>(Arrays.AsList(new string[] { "cough", "congestion" }), "cold"));

            // Four distinct features (fever, cough, congestion, nausea), 3 + 3 + 2 = 8 tokens.
            NUnit.Framework.Assert.AreEqual(4, data.NumFeatures());
            NUnit.Framework.Assert.AreEqual(4, data.NumFeatureTypes());
            NUnit.Framework.Assert.AreEqual(2, data.NumClasses());
            NUnit.Framework.Assert.AreEqual(8, data.NumFeatureTokens());
            NUnit.Framework.Assert.AreEqual(3, data.Size());

            // "nausea" occurs only once, so the expectations below drop by one feature and one token.
            data.ApplyFeatureCountThreshold(2);
            NUnit.Framework.Assert.AreEqual(3, data.NumFeatures());
            NUnit.Framework.Assert.AreEqual(3, data.NumFeatureTypes());
            NUnit.Framework.Assert.AreEqual(2, data.NumClasses());
            NUnit.Framework.Assert.AreEqual(7, data.NumFeatureTokens());
            NUnit.Framework.Assert.AreEqual(3, data.Size());

            LinearClassifierFactory <string, string> factory    = new LinearClassifierFactory <string, string>();
            LinearClassifier <string, string>        classifier = factory.TrainClassifier(data);
            IDatum <string, string> d = new BasicDatum <string, string>(Arrays.AsList(new string[] { "cough", "fever" }));

            // NUnit expects (expected, actual[, delta], message); the message must come last,
            // otherwise the message text itself is treated as the expected value.
            NUnit.Framework.Assert.AreEqual("flu", classifier.ClassOf(d), "Classification incorrect");
            ICounter <string> probs = classifier.ProbabilityOf(d);

            NUnit.Framework.Assert.AreEqual(0.4553, probs.GetCount("cold"), 0.0001, "Returned probability incorrect");
            NUnit.Framework.Assert.AreEqual(0.5447, probs.GetCount("flu"), 0.0001, "Returned probability incorrect");
            System.Console.Out.WriteLine();
        }
        /// <summary>Converts one line of SVMLight-style text into a datum.</summary>
        /// <remarks>
        /// The first whitespace-separated token is used as the label. Every later token is read as
        /// <c>feature:value</c>, and the feature is added <c>(int)value</c> times. Anything from a
        /// <c>#</c> to the end of the line is discarded. A constant feature whose name is
        /// <c>int.MaxValue.ToString()</c> is always appended. Each call increments the
        /// <c>line1</c> counter that is reported in error messages.
        /// </remarks>
        /// <param name="l">The raw input line.</param>
        /// <returns>A datum holding the expanded feature list and the line's label.</returns>
        public static IDatum <string, string> SvmLightLineToDatum(string l)
        {
            line1++;
            // Remove any trailing comment. Regex.Replace is used because System.String has no
            // regex-aware replace method.
            string stripped = System.Text.RegularExpressions.Regex.Replace(l, "#.*", string.Empty);

            // string.Split(string) would treat "\\s+" as a literal separator, so split with Regex.
            // Trailing whitespace (typically left behind by a stripped comment) is trimmed first so
            // that it does not produce an empty trailing token.
            string[]             line     = System.Text.RegularExpressions.Regex.Split(stripped.TrimEnd(), "\\s+");
            ICollection <string> features = new List <string>();

            for (int i = 1; i < line.Length; i++)
            {
                string[] f = line[i].Split(':');
                if (f.Length != 2)
                {
                    logger.Info("Dataset error: line " + line1);
                    if (f.Length < 2)
                    {
                        // There is no value part to read; indexing f[1] would throw.
                        continue;
                    }
                }
                // Parse with the invariant culture so "1.5" reads the same on every machine;
                // the cast truncates the value toward zero.
                int val = (int)double.Parse(f[1], System.Globalization.CultureInfo.InvariantCulture);
                for (int j = 0; j < val; j++)
                {
                    features.Add(f[0]);
                }
            }
            // A constant feature for a class.
            features.Add(int.MaxValue.ToString());
            IDatum <string, string> d = new BasicDatum <string, string>(features, line[0]);

            return(d);
        }
        /// <summary>
        /// Counts, for every feature, how many datums in this dataset contain it.
        /// A feature that occurs several times in one datum contributes 1 for that datum.
        /// </summary>
        /// <returns>A new counter mapping each feature to the number of datums it appears in.</returns>
        public virtual ICounter <F> GetFeatureCounter()
        {
            ICounter <F> datumFrequency = new ClassicCounter <F>();
            int          index          = 0;

            while (index < this.Size())
            {
                BasicDatum <L, F> current = (BasicDatum <L, F>)GetDatum(index);
                // Go through a hash set so repeated features within one datum are counted once.
                ICollection <F> distinctFeatures = Generics.NewHashSet(current.AsFeatures());
                foreach (F feature in distinctFeatures)
                {
                    datumFrequency.IncrementCount(feature, 1.0);
                }
                index++;
            }
            return(datumFrequency);
        }
示例#4
0
        /// <summary>
        /// Makes sure <c>logProbs</c> holds the per-tag log scores for <paramref name="word"/>.
        /// Does nothing when <paramref name="word"/> equals <c>lastWord</c>; otherwise records it
        /// as <c>lastWord</c> and recomputes <c>logProbs</c>.
        /// </summary>
        /// <param name="word">Index of the word, resolved to its string through <c>wordIndex</c>.</param>
        /// <param name="subtractTagScore">
        /// When true and the word is scored by <c>scorer</c>, <c>-Math.Log(tagDist.ProbabilityOf(tag))</c>
        /// is added to every tag's score.
        /// </param>
        private void EnsureProbs(int word, bool subtractTagScore)
        {
            if (word == lastWord)
            {
                // Scores for this word are already in logProbs.
                return;
            }
            lastWord = word;

            if (!functionWordTags.Contains(wordIndex.Get(word)))
            {
                // Not a listed function word: score it from its extracted features.
                IDatum wordDatum = new BasicDatum(featExtractor.MakeFeatures(wordIndex.Get(word)));

                logProbs = scorer.LogProbabilityOf(wordDatum);
                if (!subtractTagScore)
                {
                    return;
                }
                ICollection <string> scoredTags = logProbs.KeySet();
                foreach (string scoredTag in scoredTags)
                {
                    logProbs.IncrementCount(scoredTag, -Math.Log(tagDist.ProbabilityOf(scoredTag)));
                }
                return;
            }

            // Listed function word: tags whose basic category equals the listed tag score 0,
            // every other tag scores negative infinity.
            logProbs = new ClassicCounter <string>();
            string requiredTag = functionWordTags[wordIndex.Get(word)];
            foreach (string candidate in tagIndex.ObjectsList())
            {
                double score = ctlp.BasicCategory(candidate).Equals(requiredTag) ? 0 : double.NegativeInfinity;
                logProbs.SetCount(candidate, score);
            }
        }