コード例 #1
0
        /// <summary>
        /// Creates the shared classifier factory the first time it is needed; later calls are no-ops.
        /// </summary>
        /// <param name="meta">Pipe metadata; its <c>Model</c> is set to "intent.model" when the classifier is created.</param>
        /// <exception cref="InvalidOperationException">
        /// A "wordvecModel" setting is configured but the application domain has no "DataPath" value.
        /// </exception>
        private void Init(PipeModel meta)
        {
            if (_classifier != null)
            {
                return;
            }

            meta.Model = "intent.model";

            var options = new ClassifyOptions
            {
                ModelFilePath    = Path.Combine(Settings.ModelDir, meta.Model),
                ModelDir         = Settings.ModelDir,
                ModelName        = meta.Model,
                Word2VecFilePath = Configuration.GetValue<string>("wordvecModel")
            };

            if (!String.IsNullOrEmpty(options.Word2VecFilePath))
            {
                // GetData returns null when the value was never set; fail with a clear message
                // instead of a NullReferenceException.
                object dataPath = AppDomain.CurrentDomain.GetData("DataPath");

                if (dataPath == null)
                {
                    throw new InvalidOperationException(
                        "The application domain value 'DataPath' is not set; it is required to resolve the word vector model path.");
                }

                string contentDir = dataPath.ToString();
                options.Word2VecFilePath = options.Word2VecFilePath.Replace("|App_Data|", contentDir + Path.DirectorySeparatorChar);
            }

            var factory = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);

            // The configuration key really is spelled "classifer"; it must match the settings file.
            string classifierName = Configuration.GetValue<String>("classifer");

            factory.GetClassifer(classifierName);

            // Publish the factory only after it is fully configured, so a failure above is retried
            // on the next call instead of leaving a half-initialised instance in the field.
            _classifier = factory;
        }
コード例 #2
0
        /// <summary>
        /// Classifies the first sentence of <paramref name="doc"/> and stores the first classification
        /// result as that sentence's intent.
        /// </summary>
        /// <param name="agent">Not used by this method.</param>
        /// <param name="doc">Document whose first sentence is classified; that sentence's <c>Intent</c> is overwritten.</param>
        /// <param name="meta">Pipe metadata; <c>Model</c> names the model file under <c>Settings.ModelDir</c>.</param>
        /// <returns>Always <c>true</c>.</returns>
        public async Task<bool> Predict(Agent agent, NlpDoc doc, PipeModel meta)
        {
            string modelPath = Path.Combine(Settings.ModelDir, meta.Model);
            var factory = new ClassifierFactory<NaiveBayesClassifier, SentenceFeatureExtractor>(
                new ClassifyOptions { ModelFilePath = modelPath },
                SupportedLanguage.English);

            // Only the first sentence of the document is classified.
            var firstSentence = doc.Sentences.First();
            var input = new Sentence
            {
                Text  = firstSentence.Text,
                Words = firstSentence.Tokens
            };

            // The first entry of the classification result is used as the prediction.
            var best = factory.Classify(input).First();

            doc.Sentences[0].Intent = new TextClassificationResult
            {
                Classifier = "BotSharpNBayesClassifier",
                Label      = best.Item1,
                Confidence = (decimal)best.Item2
            };

            return true;
        }
コード例 #3
0
        /// <summary>
        /// Trains a classifier obtained by the name "NaiveBayesClassifier" on the "Fish" predicted
        /// column of the weird-column resource and checks the category predicted for one description.
        /// </summary>
        public void WeirdColumnTest()
        {
            // Keep the stream open for the whole test (whether the record set reads it eagerly is
            // not visible here) and dispose it when the test finishes.
            using (Stream weirdStream = TestUtils.RetrieveResource(weirdColumnResource))
            {
                var         weirdConfig = GetWeirdConfig();
                CCRecordSet recordSet   = CCRecordSet.FromStream(weirdStream, weirdConfig);
                var         nbc         = ClassifierFactory.GetClassifierByName<string>("NaiveBayesClassifier");

                nbc.Train(recordSet.Select(rec => new KeyValuePair<string, string>(rec.Description, rec.PredictedValues["Fish"])).ToList());

                // Assert.AreEqual takes (expected, actual); the wrong order produces misleading failure messages.
                Assert.AreEqual("Trout", nbc.Categorize("Seattle Lounge").Category);
            }
        }
コード例 #4
0
            /// <summary>
            /// Creates a tag provider for the specified buffer
            /// </summary>
            /// <typeparam name="T">The tag type</typeparam>
            /// <param name="buffer">The text buffer</param>
            /// <returns>The tagger chosen for the buffer's content type, cast to <see cref="ITagger{T}"/>.  Returns
            /// null if the buffer is null, its content type is "R Markdown", no spelling configuration is returned
            /// for the buffer, or the chosen tagger is not an <see cref="ITagger{T}"/>.</returns>
            public ITagger<T> CreateTagger<T>(ITextBuffer buffer) where T : ITag
            {
                if (buffer == null)
                {
                    return null;
                }

                var contentType = buffer.ContentType;

                // No tagger is provided for R Markdown content
                if (contentType.IsOfType("R Markdown"))
                {
                    return null;
                }

#pragma warning disable VSTHRD010
                var config = SpellingServiceProxy.GetConfiguration(buffer);
#pragma warning restore VSTHRD010

                if (config == null)
                {
                    return null;
                }

                // Markdown has its own tagger
                if (contentType.IsOfType("Markdown"))
                {
                    var markdownTagger = new MarkdownTextTagger(buffer, classifierAggregatorService.GetClassifier(buffer),
                                                                config.IgnoredClassificationsFor(contentType.TypeName));

                    return markdownTagger as ITagger<T>;
                }

                // The built-in C# classifier is avoided due to an issue with it.  Using our own tagger also
                // lets the configuration exclude certain elements from spell checking, and the configuration
                // can request this tagger for all C-style code.  Not every option applies to the other
                // languages but the structure is similar enough for most of them to be relevant.
                string filename = buffer.GetFilename();

                bool useCSharpTagger = contentType.IsOfType("csharp") ||
                                       (config.CSharpOptions.ApplyToAllCStyleLanguages && ClassifierFactory.IsCStyleCode(filename));

                if (!useCSharpTagger)
                {
                    var commentTagger = new CommentTextTagger(buffer, classifierAggregatorService.GetClassifier(buffer),
                                                              config.IgnoredXmlElements, config.SpellCheckedXmlAttributes,
                                                              config.IgnoredClassificationsFor(contentType.TypeName));

                    return commentTagger as ITagger<T>;
                }

                // The C# options are copied into the tagger for local use since it tracks the state of the
                // lines in the buffer.  Changing the global options requires open editors to be closed and
                // reopened for the changes to take effect.
                var csharpTagger = new CSharpCommentTextTagger(buffer)
                {
                    SupportsOldStyleXmlDocComments = ClassifierFactory.SupportsOldStyleXmlDocComments(filename),
                    IgnoreXmlDocComments = config.CSharpOptions.IgnoreXmlDocComments,
                    IgnoreDelimitedComments = config.CSharpOptions.IgnoreDelimitedComments,
                    IgnoreStandardSingleLineComments = config.CSharpOptions.IgnoreStandardSingleLineComments,
                    IgnoreQuadrupleSlashComments = config.CSharpOptions.IgnoreQuadrupleSlashComments,
                    IgnoreNormalStrings = config.CSharpOptions.IgnoreNormalStrings,
                    IgnoreVerbatimStrings = config.CSharpOptions.IgnoreVerbatimStrings,
                    IgnoreInterpolatedStrings = config.CSharpOptions.IgnoreInterpolatedStrings,
                    IgnoredXmlElements = config.IgnoredXmlElements,
                    SpellCheckedAttributes = config.SpellCheckedXmlAttributes
                };

                return csharpTagger as ITagger<T>;
            }
コード例 #5
0
        /// <summary>
        /// Trains a classifier obtained by the name "NaiveBayesClassifier" on the book example data
        /// and checks the genre predicted for three titles.
        /// </summary>
        public void NaiveBayesClassifierSimpleTest()
        {
            var records    = KeyValuePairFromTsv(bookExample);
            var classifier = ClassifierFactory.GetClassifierByName<Genre>("NaiveBayesClassifier");

            classifier.Train(records);

            // Assert.AreEqual takes (expected, actual); the wrong order produces misleading failure messages.
            Assert.AreEqual(Genre.INTERIORDECORATING, classifier.Categorize("Curtains and Drapes").Category);
            Assert.AreEqual(Genre.ENGINEERING, classifier.Categorize("The Ventilation of Bridges").Category);
            Assert.AreEqual(Genre.ACCOUNTING, classifier.Categorize("Tax Accounting").Category);
        }
コード例 #6
0
        /// <summary>
        /// Reads the cooking.stackexchange corpus, tokenizes and shuffles it, trains on the first part
        /// of a 0.7 split and requires more than 50% of the remaining sentences to be labelled correctly.
        /// </summary>
        public void CookingTest()
        {
            string corpusDir = Path.Combine(Configuration.GetValue<String>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange");

            var sentences = new FasttextDataReader().Read(new ReaderOptions
            {
                DataDir  = corpusDir,
                FileName = "cooking.stackexchange.txt"
            });

            var tokenizer = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
            tokenizer.GetTokenizer<TreebankTokenizer>();

            // Tokenizing yields new sentence objects, so carry the labels over by position.
            var tokenized = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList());

            for (int index = 0; index < tokenized.Count; index++)
            {
                tokenized[index].Label = sentences[index].Label;
            }

            sentences = tokenized.ToList();
            sentences.Shuffle();

            var options = new ClassifyOptions
            {
                ModelFilePath     = Path.Combine(corpusDir, "nb.model"),
                TrainingCorpusDir = corpusDir,
                Dimension         = 100
            };
            var classifier = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);

            // Item1 is used for training, Item2 for evaluation.
            var split = sentences.Split(0.7M);

            classifier.Train(split.Item1);

            int hits = 0;
            int seen = 0;

            foreach (var sample in split.Item2)
            {
                var predictions = classifier.Classify(sample);

                if (sample.Label == predictions[0].Item1)
                {
                    hits++;
                }

                seen++;
            }

            var accuracy = (float)hits / seen;

            Assert.IsTrue(accuracy > 0.5);
        }
コード例 #7
0
        /// <summary>
        /// Reads train.csv, tokenizes and shuffles it, trains the "NaiveBayesClassifier" on the first part
        /// of a 0.7 split of 2000 sentences and requires more than 50% of the rest to be labelled correctly.
        /// </summary>
        public void SpookyAuthorIdentification()
        {
            var sentences = new KaggleTextDataReader().Read(new ReaderOptions { FileName = "train.csv" });

            var tokenizer = new TokenizerFactory(new TokenizationOptions(), SupportedLanguage.English);
            tokenizer.GetTokenizer<TreebankTokenizer>();

            // Tokenizing yields new sentence objects, so carry ids and labels over by position.
            var tokenized = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList());

            for (int index = 0; index < tokenized.Count; index++)
            {
                var source = sentences[index];

                tokenized[index].Id    = source.Id;
                tokenized[index].Label = source.Label;
            }

            sentences = tokenized.ToList();
            sentences.Shuffle();

            // Item1 is used for training, Item2 for evaluation.
            var split = sentences.Take(2000).ToList().Split(0.7M);

            string baseDir = AppContext.BaseDirectory;
            var options = new ClassifyOptions
            {
                ModelDir      = baseDir,
                ModelFilePath = Path.Combine(baseDir, "nb.model"),
                Dimension     = 300
            };
            var classifier = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);

            classifier.GetClassifer("NaiveBayesClassifier");
            classifier.Train(split.Item1);

            int hits = 0;
            int seen = 0;

            foreach (var sample in split.Item2)
            {
                var predictions = classifier.Classify(sample);

                if (sample.Label == predictions[0].Item1)
                {
                    hits++;
                }

                seen++;
            }

            var accuracy = (float)hits / seen;

            Assert.IsTrue(accuracy > 0.5);
        }
コード例 #8
0
        //=====================================================================

        /// <summary>
        /// Constructor.  Stores the supplied services, creates the word splitter for the buffer's file,
        /// subscribes to the buffer, aggregator, dictionary, and view events, and marks every line of the
        /// buffer's current snapshot as dirty.
        /// </summary>
        /// <param name="buffer">The text buffer</param>
        /// <param name="view">The text view (only its <c>Closed</c> event is subscribed to here)</param>
        /// <param name="naturalTextAggregator">The tag aggregator</param>
        /// <param name="urlAggregator">The URL aggregator</param>
        /// <param name="configuration">The spell checker configuration to use</param>
        /// <param name="dictionary">The spelling dictionary to use</param>
        public SpellingTagger(ITextBuffer buffer, ITextView view,
                              ITagAggregator <INaturalTextTag> naturalTextAggregator, ITagAggregator <IUrlTag> urlAggregator,
                              SpellCheckerConfiguration configuration, SpellingDictionary dictionary)
        {
            _isClosed = false;
            _buffer   = buffer;
            _naturalTextAggregator = naturalTextAggregator;
            _urlAggregator         = urlAggregator;
            // Capture the dispatcher of the thread that constructs the tagger
            _dispatcher            = Dispatcher.CurrentDispatcher;
            this.configuration     = configuration;
            _dictionary            = dictionary;

            _dirtySpans        = new List <SnapshotSpan>();
            _misspellings      = new List <MisspellingTag>();
            wordsIgnoredOnce   = new List <IgnoredOnceWord>();
            inlineIgnoredWords = new List <InlineIgnoredWord>();

            // The word splitter's mnemonic and C-style flag are derived from the buffer's filename
            string filename = buffer.GetFilename();

            wordSplitter = new WordSplitter
            {
                Configuration = configuration,
                Mnemonic      = ClassifierFactory.GetMnemonic(filename),
                IsCStyleCode  = ClassifierFactory.IsCStyleCode(filename)
            };

            // Both aggregators share the same tags-changed handler
            _buffer.Changed += BufferChanged;
            _naturalTextAggregator.TagsChanged += AggregatorTagsChanged;
            _urlAggregator.TagsChanged         += AggregatorTagsChanged;
            _dictionary.DictionaryUpdated      += DictionaryUpdated;
            _dictionary.ReplaceAll             += ReplaceAll;
            _dictionary.IgnoreOnce             += IgnoreOnce;

            view.Closed += ViewClosed;

            // Strings in SQL script can contain escaped single quotes which are apostrophes.  Unescape them
            // so that they are spell checked correctly.
            unescapeApostrophes = buffer.ContentType.IsOfType("SQL Server Tools");

            // To start with, the entire buffer is dirty.  Split this into chunks so we update pieces at a time.
            ITextSnapshot snapshot = _buffer.CurrentSnapshot;

            // One dirty span is added per line of the snapshot
            foreach (var line in snapshot.Lines)
            {
                AddDirtySpan(line.Extent);
            }
        }
コード例 #9
0
        /// <summary>
        /// Trains a word-feature classifier on the "Gender" corpus and requires more than 50% of a
        /// 2000-entry test slice to be labelled correctly.
        /// </summary>
        public void GenderTest()
        {
            var options = new ClassifyOptions
            {
                TrainingCorpusDir = Path.Combine(Configuration.GetValue<String>("MachineLearning:dataDir"), "Gender")
            };
            var classifier = new ClassifierFactory<WordFeatureExtractor>(options, SupportedLanguage.English);

            var corpus = GetLabeledCorpus(options);

            var tokenizer = new TokenizerFactory(new TokenizationOptions
            {
                Pattern = RegexTokenizer.WORD_PUNC
            }, SupportedLanguage.English);

            tokenizer.GetTokenizer<RegexTokenizer>();

            corpus.ForEach(x => x.Words = tokenizer.Tokenize(x.Text));

            // First pass: train on the whole corpus and classify a single name.  The result is not
            // inspected; this only exercises the call.
            classifier.Train(corpus);
            string text = "Bridget";

            classifier.Classify(new Sentence
            {
                Text = text, Words = tokenizer.Tokenize(text)
            });

            // Second pass: after shuffling, the first 2000 entries are the test set and the rest
            // are used for training.
            corpus.Shuffle();
            var trainingData = corpus.Skip(2000).ToList();

            classifier.Train(trainingData);

            var testData = corpus.Take(2000).ToList();
            int correct  = 0;

            testData.ForEach(td =>
            {
                var classes = classifier.Classify(td);
                if (td.Label == classes[0].Item1)
                {
                    correct++;
                }
            });

            var accuracy = (float)correct / testData.Count;

            // Previously the accuracy was computed and discarded, so the test could never fail on a
            // bad model.  Use the same sanity threshold as the other classification tests.
            Assert.IsTrue(accuracy > 0.5);
        }
コード例 #10
0
        /// <summary>
        /// Trains classifiers obtained by the name "NaiveBayesClassifier" on the "Category" and "Owner"
        /// predicted columns of the full charge list and checks the category predicted for sample descriptions.
        /// </summary>
        public void NaiveBayesClassificationIntegrationTest()
        {
            // Keep the stream open for the whole test (whether the record set reads it eagerly is
            // not visible here) and dispose it when the test finishes.
            using (Stream oldRecordsStream = TestUtils.RetrieveResource(fullChargeList))
            {
                CCRecordSet records      = CCRecordSet.FromStream(oldRecordsStream, config);
                var         nbc          = ClassifierFactory.GetClassifierByName<string>("NaiveBayesClassifier");
                var         trainingData = records
                                           .Select(rec => new KeyValuePair<string, string>(rec.Description, rec.PredictedValues["Category"]))
                                           .ToList();

                nbc.Train(trainingData);

                // Assert.AreEqual takes (expected, actual); the wrong order produces misleading failure messages.
                Assert.AreEqual("GROC", nbc.Categorize("Trader Joe's").Category);
                Assert.AreEqual("TRANS", nbc.Categorize("Shell Oil 27440482209 Seattle Wa").Category);

                // A fresh classifier is requested for the second column.
                trainingData = records
                               .Select(rec => new KeyValuePair<string, string>(rec.Description, rec.PredictedValues["Owner"]))
                               .ToList();
                nbc = ClassifierFactory.GetClassifierByName<string>("NaiveBayesClassifier");
                nbc.Train(trainingData);
                Assert.AreEqual("Bob", nbc.Categorize("Radio Shack 00133652 Knoxville").Category);
            }
        }
コード例 #11
0
        /// <summary>
        /// Creates the shared classifier factory the first time it is needed; later calls are no-ops.
        /// </summary>
        /// <param name="meta">Pipe metadata; its <c>Model</c> is set to "intent.model" when the classifier is created.</param>
        private void Init(PipeModel meta)
        {
            if (_classifier != null)
            {
                return;
            }

            meta.Model = "intent.model";

            var options = new ClassifyOptions
            {
                ModelFilePath = Path.Combine(Settings.ModelDir, meta.Model),
                ModelDir      = Settings.ModelDir,
                ModelName     = meta.Model
            };

            var factory = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);

            // The configuration key really is spelled "classifer"; it must match the settings file.
            string classifierName = Configuration.GetValue<String>("classifer");

            factory.GetClassifer(classifierName);

            // Publish the factory only after it is fully configured, so a failure above is retried
            // on the next call instead of leaving a half-initialised instance in the field.
            _classifier = factory;
        }
コード例 #12
0
ファイル: Updater.cs プロジェクト: natesternberg/C3
        /// <summary>
        /// Trains one classifier per configured predicted column on the existing records, then passes the
        /// record set, the new records, and the trained classifiers to <c>AppendRecords</c>.
        /// </summary>
        /// <param name="oldRecordSet">Existing records used as training data; also the instance that is returned.</param>
        /// <param name="newRecords">Records handed to <c>AppendRecords</c>.</param>
        /// <param name="reader">Not used by this method.</param>
        /// <param name="config">Configuration supplying the predicted columns.</param>
        /// <returns>The same <paramref name="oldRecordSet"/> instance.</returns>
        internal static CCRecordSet ClassifyAndUpdate(CCRecordSet oldRecordSet, List<CCRecord> newRecords,
                                                      IRecordReader reader, C3Configuration config)
        {
            var trainedByColumn = new Dictionary<C3PredictedColumn, IClassifier<string>>();

            foreach (C3PredictedColumn column in config.columns)
            {
                var columnName     = column.columnName;
                var classifierName = column.classifierName;

                // Each sample maps a record's description to its known value for this column.
                var samples = oldRecordSet
                              .Select(rec => new KeyValuePair<string, string>(rec.Description, rec.PredictedValues[columnName]))
                              .ToList();

                var trained = ClassifierFactory.GetClassifierByName<string>(classifierName);

                Utils.Log(LoggingSeverity.DEBUG,
                          $"Training {classifierName} on column '{columnName}' with {samples.Count} records");

                trained.Train(samples);
                trainedByColumn.Add(column, trained);
            }

            AppendRecords(oldRecordSet, newRecords, trainedByColumn);

            return oldRecordSet;
        }
        /// <summary>
        /// Determines the state of the command for the given context.
        /// </summary>
        /// <param name="context">The command context the item is extracted from.</param>
        /// <returns>
        /// <c>Disabled</c> when <c>Settings.MissingKeys()</c> is true; <c>Hidden</c> when no item is extracted,
        /// the item's template is not the classifier template, or the created classifier is not an
        /// <c>IClassTrainer</c>; otherwise <c>Enabled</c>.
        /// </returns>
        public override CommandState QueryState(CommandContext context)
        {
            if (Settings.MissingKeys())
            {
                return CommandState.Disabled;
            }

            Item selected = DataWrapper?.ExtractItem(context);

            if (selected == null || selected.TemplateID.Guid != Settings.ClassifierTemplateId.Guid)
            {
                return CommandState.Hidden;
            }

            // The command is only offered for classifiers that can be trained.
            if (ClassifierFactory.Create(selected) is IClassTrainer)
            {
                return CommandState.Enabled;
            }

            return CommandState.Hidden;
        }
コード例 #14
0
        /// <summary>
        /// Recursively narrows the threshold: at each step the error rate is calculated half an interval
        /// below and above the current threshold, the side with the smaller error delta is kept, and the
        /// interval is halved, until the interval drops below <paramref name="minInterval"/>.
        /// </summary>
        /// <param name="sampleSet">Sample set passed through to the error-rate calculation.</param>
        /// <param name="classifier">Classifier factory passed through to the error-rate calculation.</param>
        /// <param name="trainingSize">Training size passed through to the error-rate calculation.</param>
        /// <param name="minInterval">Interval below which the search stops.</param>
        /// <param name="interval">Current search interval.</param>
        /// <param name="result">Result holding the current threshold.</param>
        /// <param name="xValidationStart">Cross-validation start passed through to the error-rate calculation.</param>
        /// <param name="xValidationLength">Cross-validation length passed through to the error-rate calculation.</param>
        /// <returns><paramref name="result"/> once the interval is below the minimum; otherwise the result of the narrowed search.</returns>
        private static BiometricResult Evaluate<T>(SampleSet<T> sampleSet, ClassifierFactory<T> classifier, int trainingSize, double minInterval, double interval, BiometricResult result, int xValidationStart, int xValidationLength) where T : ISample
        {
            // The threshold found for the smallest interval represents the best guess at the EER.
            if (interval < minInterval)
            {
                return result;
            }

            var halfInterval   = interval / 2.0;
            var lowerThreshold = result.GetThreshold() - halfInterval;
            var upperThreshold = result.GetThreshold() + halfInterval;

            Tuple<ErrorRatePair, List<ErrorRatePair>> lower = CalculateErrorRate(sampleSet, classifier, trainingSize, lowerThreshold, xValidationStart, xValidationLength);
            Tuple<ErrorRatePair, List<ErrorRatePair>> upper = CalculateErrorRate(sampleSet, classifier, trainingSize, upperThreshold, xValidationStart, xValidationLength);

            // Continue from whichever side has the smaller error delta; ties go to the upper side.
            BiometricResult narrowed;

            if (lower.Item1.GetErrorDelta() < upper.Item1.GetErrorDelta())
            {
                narrowed = new BiometricResult(lowerThreshold, lower.Item2);
            }
            else
            {
                narrowed = new BiometricResult(upperThreshold, upper.Item2);
            }

            return Evaluate(sampleSet, classifier, trainingSize, minInterval, halfInterval, narrowed, xValidationStart, xValidationLength);
        }
コード例 #15
0
        /// <summary>
        /// Trains a naive Bayes classifier on the sentences of <paramref name="doc"/>, using each
        /// sentence's intent label as its class, then writes a console message naming the model file path.
        /// </summary>
        /// <param name="agent">Not used by this method.</param>
        /// <param name="doc">Document supplying the labelled sentences.</param>
        /// <param name="meta">Pipe metadata; its <c>Model</c> is set to "classification-nb.model".</param>
        /// <returns>Always <c>true</c>.</returns>
        public async Task<bool> Train(Agent agent, NlpDoc doc, PipeModel meta)
        {
            meta.Model = "classification-nb.model";

            string modelFileName = Path.Combine(Settings.ModelDir, meta.Model);

            var factory = new ClassifierFactory<NaiveBayesClassifier, SentenceFeatureExtractor>(
                new ClassifyOptions { ModelFilePath = modelFileName },
                SupportedLanguage.English);

            var trainingSet = (from sentence in doc.Sentences
                               select new Sentence
                               {
                                   Label = sentence.Intent.Label,
                                   Text  = sentence.Text,
                                   Words = sentence.Tokens
                               }).ToList();

            factory.Train(trainingSet);

            Console.WriteLine($"Saved model to {modelFileName}");

            return true;
        }
コード例 #16
0
        //=====================================================================

        /// <summary>
        /// Refresh the extension list when the file type selection changes.  The list is "All" followed
        /// by the extensions reported for the selected file type, and the first entry is selected.
        /// </summary>
        /// <param name="sender">The sender of the event</param>
        /// <param name="e">The event arguments</param>
        private void cboFileType_SelectionChanged(object sender, SelectionChangedEventArgs e)
        {
            var fileType   = (string)cboFileType.SelectedItem;
            var extensions = ClassifierFactory.ExtensionsFor(fileType);

            cboExtension.ItemsSource   = new[] { "All" }.Concat(extensions);
            cboExtension.SelectedIndex = 0;
        }
コード例 #17
0
 /// <summary>
 /// Evaluates instance 0 of the given classifier factory against the testing samples.
 /// </summary>
 /// <param name="classifier">Factory from which instance 0 is obtained.</param>
 /// <param name="testingSamples">Samples passed to the evaluation.</param>
 /// <returns>The result of evaluating that instance.</returns>
 public static BiometricResult Evaluate(ClassifierFactory<ISample> classifier, List<ISample> testingSamples)
 {
     var instance = classifier.GetInstance(0);

     return Evaluate(instance, testingSamples);
 }
コード例 #18
0
        /// <summary>
        /// Calculates the error rate at the given threshold for each cross-validation index in the
        /// requested range and sums the individual rates.
        /// </summary>
        /// <param name="sampleSet">Sample set the testing samples are taken from.</param>
        /// <param name="classifier">Factory supplying one classifier instance per cross-validation index.</param>
        /// <param name="trainingSize">Training size used when selecting the testing samples.</param>
        /// <param name="threshold">Threshold at which the error rate is calculated.</param>
        /// <param name="xValidationStart">First cross-validation index.</param>
        /// <param name="xValidationLength">Number of cross-validation indices; the list is indexed at 0, so a zero length throws.</param>
        /// <returns>The summed error rate together with the list of per-index error rates.</returns>
        private static Tuple<ErrorRatePair, List<ErrorRatePair>> CalculateErrorRate<T>(SampleSet<T> sampleSet, ClassifierFactory<T> classifier, int trainingSize, double threshold, int xValidationStart, int xValidationLength) where T : ISample
        {
            // Each cross-validation index is evaluated through a parallel query.
            var indices = Enumerable.Range(xValidationStart, xValidationLength).AsParallel();

            List<ErrorRatePair> errorRates = indices
                                             .Select(index => CalculateErrorRate(
                                                         classifier.GetInstance(index),
                                                         SampleSetHelpers.GetSampleSetTestingSamples(sampleSet, trainingSize, index),
                                                         threshold))
                                             .ToList();

            // Fold the remaining rates onto the first one, left to right.
            ErrorRatePair totalError = errorRates
                                       .Skip(1)
                                       .Aggregate(errorRates[0], (sum, next) => sum + next);

            return new Tuple<ErrorRatePair, List<ErrorRatePair>>(totalError, errorRates);
        }