public void TestSimpleTraining()
        {
            // Trains a Naive Bayes categorizer on five tiny documents (two labelled "1",
            // three labelled "0") and checks the predicted category for one token of each label.
            IObjectStream <DocumentSample> trainingStream = new GenericObjectStream <DocumentSample>(
                new DocumentSample("1", new[] { "a", "b", "c", "1", "2" }),
                new DocumentSample("1", new[] { "a", "b", "c", "3", "4" }),
                new DocumentSample("0", new[] { "x", "y", "z" }),
                new DocumentSample("0", new[] { "x", "y", "z", "5", "6" }),
                new DocumentSample("0", new[] { "x", "y", "z", "7", "8" }));

            var trainingParameters = new TrainingParameters();
            trainingParameters.Set(Parameters.Iterations, "100");
            trainingParameters.Set(Parameters.Cutoff, "0");
            trainingParameters.Set(Parameters.Algorithm, Parameters.Algorithms.NaiveBayes);

            var factory = new DocumentCategorizerFactory(
                WhitespaceTokenizer.Instance,
                new [] { new BagOfWordsFeatureGenerator() });

            var trainedModel = DocumentCategorizerME.Train("x-unspecified", trainingStream, trainingParameters, factory);
            var categorizer  = new DocumentCategorizerME(trainedModel);

            // "a" only occurs in documents labelled "1".
            var outcomesForA = categorizer.Categorize("a");
            Assert.AreEqual("1", categorizer.GetBestCategory(outcomesForA));

            // "x" only occurs in documents labelled "0".
            var outcomesForX = categorizer.Categorize("x");
            Assert.AreEqual("0", categorizer.GetBestCategory(outcomesForX));

            // The last entry of the sorted score map is expected to hold category "1",
            // i.e. the category with the highest score for "a".
            var scoresForA = categorizer.SortedScoreMap("a");
            Assert.AreEqual("1", scoresForA.Last().Value[0]);
        }
 /// <summary>
 /// Trains a document categorizer model from <c>CreateSampleStream()</c> using the
 /// default training parameters and the language code "x-unspecified".
 /// </summary>
 /// <param name="factory">Factory passed to the trainer; when null, a new <c>DocumentCategorizerFactory</c> is used.</param>
 /// <returns>The model returned by <c>DocumentCategorizerME.Train</c>.</returns>
 private static DocumentCategorizerModel Train(DocumentCategorizerFactory factory = null) {
     // Arguments are materialized in the same order the call would evaluate them.
     var sampleStream     = CreateSampleStream();
     var parameters       = TrainingParameters.DefaultParameters();
     var effectiveFactory = factory != null ? factory : new DocumentCategorizerFactory();

     return DocumentCategorizerME.Train("x-unspecified", sampleStream, parameters, effectiveFactory);
 }
        /// <summary>
        /// Trains a model via <c>Train()</c>, categorizes the text returned by <c>GetSourceText()</c>,
        /// writes the trained model to "en-sentiment.bin" under the application base path, and
        /// returns the best category converted to a sentiment result.
        /// </summary>
        /// <returns>
        /// The best-scoring category for the source text, converted with
        /// <c>ConvertRawResultAsSentimentResult()</c>.
        /// </returns>
        public SentimentAnalyzeResult Analyze()
        {
            var model       = Train();
            var categorizer = new DocumentCategorizerME(model);

            var content           = GetSourceText();
            var distribution      = categorizer.Categorize(content);
            var predictedCategory = categorizer.GetBestCategory(distribution);

            // FileMode.Create replaces any previous file. Appending would stack a new
            // serialized model behind the old ones on every call, so the file would
            // grow without bound and no longer hold a single model.
            var modelPath = Path.Combine(AppConfig.GetAppBasePath(), "en-sentiment.bin");
            using (var stream = new FileStream(modelPath, FileMode.Create))
            {
                SerializeHelper.Serialize(stream, model);
            }

            return predictedCategory.ConvertRawResultAsSentimentResult();
        }
        /// <summary>
        /// Pairs a categorizer with a dictionary keyed by the categorizer's category names,
        /// validating that the dictionary covers exactly the categorizer's categories.
        /// </summary>
        /// <param name="categorizer">The categorizer whose categories are being mapped. Must not be null.</param>
        /// <param name="mappedCategories">
        /// Dictionary whose keys are the categorizer's category names. Must not be null or empty,
        /// must have as many entries as the categorizer has categories, and must contain a key
        /// for every category the categorizer reports.
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="categorizer"/> or <paramref name="mappedCategories"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="mappedCategories"/> is empty, has a different number of entries than the
        /// categorizer has categories, or lacks a key for one of the categorizer's categories.
        /// </exception>
        public CategoryMappedDocumentCategorizerModel(DocumentCategorizerME categorizer, Dictionary <string, string> mappedCategories)
        {
            Categorizer      = categorizer;
            MappedCategories = mappedCategories;

            // Validation throws in every build configuration. ArgumentNullException derives from
            // ArgumentException, so callers catching ArgumentException keep working.
            if (categorizer == null)
            {
                throw new ArgumentNullException(nameof(categorizer), "Categorizer cannot be null!");
            }

            if (mappedCategories == null)
            {
                throw new ArgumentNullException(nameof(mappedCategories), "Mapped categories cannot be null!");
            }

            // ArgumentException takes (message, paramName) - message first.
            if (mappedCategories.Count == 0)
            {
                throw new ArgumentException("Mapped categories must contain one or more entries!", nameof(mappedCategories));
            }

            var totalInternalCats = categorizer.getNumberOfCategories();

            if (totalInternalCats != mappedCategories.Count)
            {
                throw new ArgumentException(
                    "Mapped categories must have the same number of entries as their corresponding categorizer!",
                    nameof(mappedCategories));
            }

            // Equal counts alone do not guarantee coverage; every category name must be a key.
            for (int i = 0; i < totalInternalCats; ++i)
            {
                var internalCatName = categorizer.getCategory(i);

                if (!mappedCategories.ContainsKey(internalCatName))
                {
                    throw new ArgumentException("Found unmapped category!", nameof(mappedCategories));
                }
            }
        }
Example #5
0
        /// <summary>
        /// Prints the help text when no arguments are supplied. Otherwise loads the model from the
        /// file named by the first argument, categorizes each document read from the paragraph
        /// stream built over <c>Console.OpenStandardInput</c>, and writes one sample per document
        /// to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; <c>args[0]</c> is used as the model file.</param>
        public override void run(string[] args)
        {
            // Nothing to do without a model file: show usage and leave.
            if (args.Length == 0)
            {
                Console.WriteLine(Help);
                return;
            }

            DoccatModel loadedModel = (new DoccatModelLoader()).load(new File(args[0]));
            var categorizer = new DocumentCategorizerME(loadedModel);

            ObjectStream <string> paragraphs = new ParagraphStream(new PlainTextByLineStream(new InputStreamReader(Console.OpenStandardInput)));

            var monitor = new PerformanceMonitor(System.err, "doc");
            monitor.start();

            try
            {
                // read() returning null ends the loop.
                for (string text = paragraphs.read(); text != null; text = paragraphs.read())
                {
                    double[] outcomes = categorizer.categorize(WhitespaceTokenizer.INSTANCE.tokenize(text));
                    string   best     = categorizer.getBestCategory(outcomes);

                    // Emit the document paired with its predicted category.
                    Console.WriteLine(new DocumentSample(best, text).ToString());

                    monitor.incrementCounter();
                }
            }
            catch (IOException ioError)
            {
                CmdLineUtil.handleStdinIoError(ioError);
            }

            monitor.stopAndPrintFinalResult();
        }
Example #6
0
        public void TestSimpleTraining()
        {
            // Trains a categorizer with the default factory on six tiny documents (three per
            // label) and checks the predicted category for one token of each label.
            IObjectStream <DocumentSample> trainingStream = new GenericObjectStream <DocumentSample>(new[] {
                new DocumentSample("1", new[] { "a", "b", "c" }),
                new DocumentSample("1", new[] { "a", "b", "c", "1", "2" }),
                new DocumentSample("1", new[] { "a", "b", "c", "3", "4" }),
                new DocumentSample("0", new[] { "x", "y", "z" }),
                new DocumentSample("0", new[] { "x", "y", "z", "5", "6" }),
                new DocumentSample("0", new[] { "x", "y", "z", "7", "8" })
            });

            var trainingParameters = new TrainingParameters();
            trainingParameters.Set(Parameters.Iterations, "100");
            trainingParameters.Set(Parameters.Cutoff, "0");

            var trainedModel = DocumentCategorizerME.Train("x-unspecified", trainingStream, trainingParameters, new DocumentCategorizerFactory());
            var categorizer  = new DocumentCategorizerME(trainedModel);

            // "a" only occurs in documents labelled "1".
            var outcomesForA = categorizer.Categorize("a");
            Assert.AreEqual("1", categorizer.GetBestCategory(outcomesForA));

            // "x" only occurs in documents labelled "0".
            var outcomesForX = categorizer.Categorize("x");
            Assert.AreEqual("0", categorizer.GetBestCategory(outcomesForX));

            // Only the first entry of the sorted score map is inspected: it is expected to
            // hold category "0", leaving category "1" as the second (last) entry.
            var scoresForA = categorizer.SortedScoreMap("a");

            foreach (var entry in scoresForA)
            {
                Assert.AreEqual("0", entry.Value[0]);
                break;
            }
        }
 /// <summary>
 /// Trains a document categorizer model from <c>GetSentimentModelStream()</c> using the
 /// default training parameters and the language code "en".
 /// </summary>
 /// <param name="factory">Factory passed to the trainer; when null, a new <c>DocumentCategorizerFactory</c> is used.</param>
 /// <returns>The model returned by <c>DocumentCategorizerME.Train</c>.</returns>
 public DocumentCategorizerModel Train(DocumentCategorizerFactory factory = null)
 {
     // Arguments are materialized in the same order the call would evaluate them.
     var sampleStream     = GetSentimentModelStream();
     var parameters       = TrainingParameters.DefaultParameters();
     var effectiveFactory = factory ?? new DocumentCategorizerFactory();

     return DocumentCategorizerME.Train("en", sampleStream, parameters, effectiveFactory);
 }