/// <summary>
/// Creates the classifier factory held in <c>_classifier</c> the first time it is needed.
/// </summary>
/// <remarks>
/// Nothing happens when <c>_classifier</c> is already set. Otherwise <paramref name="meta"/> is
/// pointed at "intent.model", the options are filled from <c>Settings.ModelDir</c> and the
/// "wordvecModel" configuration value, and the classifier named by the "classifer"
/// configuration value is requested from the new factory.
/// </remarks>
/// <param name="meta">Pipe metadata; its <c>Model</c> is overwritten on first initialization.</param>
private void Init(PipeModel meta)
{
    if (_classifier != null)
    {
        return;
    }

    meta.Model = "intent.model";

    var options = new ClassifyOptions();
    options.ModelFilePath = Path.Combine(Settings.ModelDir, meta.Model);
    options.ModelDir = Settings.ModelDir;
    options.ModelName = meta.Model;
    options.Word2VecFilePath = Configuration.GetValue<string>("wordvecModel");

    if (!String.IsNullOrEmpty(options.Word2VecFilePath))
    {
        // Expand the |App_Data| placeholder to the application's data directory.
        string dataRoot = AppDomain.CurrentDomain.GetData("DataPath").ToString();
        string expandedPrefix = dataRoot + Path.DirectorySeparatorChar;
        options.Word2VecFilePath = options.Word2VecFilePath.Replace("|App_Data|", expandedPrefix);
    }

    _classifier = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);

    // The configuration key really is spelled "classifer"; it must match the configuration source.
    string configuredClassifier = Configuration.GetValue<String>("classifer");
    _classifier.GetClassifer(configuredClassifier);
}
/// <summary>
/// Classifies the first sentence of <paramref name="doc"/> and stores the first classification
/// result as that sentence's intent.
/// </summary>
/// <param name="agent">The agent; not used by this method.</param>
/// <param name="doc">The document whose first sentence is classified and updated.</param>
/// <param name="meta">Pipe metadata; <c>Model</c> names the model file under <c>Settings.ModelDir</c>.</param>
/// <returns>Always <c>true</c>.</returns>
public async Task<bool> Predict(Agent agent, NlpDoc doc, PipeModel meta)
{
    string modelPath = Path.Combine(Settings.ModelDir, meta.Model);
    var options = new ClassifyOptions { ModelFilePath = modelPath };
    var classifier = new ClassifierFactory<NaiveBayesClassifier, SentenceFeatureExtractor>(options, SupportedLanguage.English);

    // Only the first sentence of the document is classified.
    var firstSentence = doc.Sentences.First();
    var sentence = new Sentence
    {
        Text = firstSentence.Text,
        Words = firstSentence.Tokens
    };

    var result = classifier.Classify(sentence);
    var top = result.First();

    var intent = new TextClassificationResult();
    intent.Classifier = "BotSharpNBayesClassifier";
    intent.Label = top.Item1;
    intent.Confidence = (decimal)top.Item2;
    doc.Sentences[0].Intent = intent;

    return true;
}
/// <summary>
/// Trains a naive Bayes classifier on the "Fish" column of the weird-column resource and
/// checks that "Seattle Lounge" is categorized as "Trout".
/// </summary>
public void WeirdColumnTest()
{
    // Release the resource stream once the record set has been read and used.
    using (Stream weirdStream = TestUtils.RetrieveResource(weirdColumnResource))
    {
        var weirdConfig = GetWeirdConfig();
        CCRecordSet recordSet = CCRecordSet.FromStream(weirdStream, weirdConfig);

        var nbc = ClassifierFactory.GetClassifierByName<string>("NaiveBayesClassifier");
        nbc.Train(recordSet
            .Select(rec => new KeyValuePair<string, string>(rec.Description, rec.PredictedValues["Fish"]))
            .ToList());

        // Expected value first, actual second, so a failure message reports them the right way round.
        Assert.AreEqual("Trout", nbc.Categorize("Seattle Lounge").Category);
    }
}
/// <summary>
/// Creates a tag provider for the specified buffer
/// </summary>
/// <typeparam name="T">The tag type</typeparam>
/// <param name="buffer">The text buffer</param>
/// <returns>The tag provider for the specified buffer, or null if the buffer is null, its content
/// type is R Markdown, or no spelling configuration is returned for it.</returns>
public ITagger<T> CreateTagger<T>(ITextBuffer buffer) where T : ITag
{
    if (buffer == null)
    {
        return null;
    }

    // R Markdown buffers never get a tagger from this provider
    if (buffer.ContentType.IsOfType("R Markdown"))
    {
        return null;
    }

#pragma warning disable VSTHRD010
    var spellingConfig = SpellingServiceProxy.GetConfiguration(buffer);
#pragma warning restore VSTHRD010

    if (spellingConfig == null)
    {
        return null;
    }

    // Markdown has its own tagger
    if (buffer.ContentType.IsOfType("Markdown"))
    {
        var markdownTagger = new MarkdownTextTagger(buffer, classifierAggregatorService.GetClassifier(buffer),
            spellingConfig.IgnoredClassificationsFor(buffer.ContentType.TypeName));

        return markdownTagger as ITagger<T>;
    }

    // Due to an issue with the built-in C# classifier, we avoid using it.  This also lets us provide
    // configuration options to exclude certain elements from being spell checked if not wanted.
    // Through the configuration options, we can also specify this tagger be used for all C-style
    // code.  Not all configuration options will apply but the structure is similar enough to make
    // most of them relevant.
    string bufferFilename = buffer.GetFilename();

    bool useCSharpTagger = buffer.ContentType.IsOfType("csharp") ||
        (spellingConfig.CSharpOptions.ApplyToAllCStyleLanguages && ClassifierFactory.IsCStyleCode(bufferFilename));

    if (!useCSharpTagger)
    {
        var commentTagger = new CommentTextTagger(buffer, classifierAggregatorService.GetClassifier(buffer),
            spellingConfig.IgnoredXmlElements, spellingConfig.SpellCheckedXmlAttributes,
            spellingConfig.IgnoredClassificationsFor(buffer.ContentType.TypeName));

        return commentTagger as ITagger<T>;
    }

    // The C# options are passed to the tagger for local use since it tracks the state of the
    // lines in the buffer.  Changing the global options will require that any open editors be
    // closed and reopened for the changes to take effect.
    var csharpTagger = new CSharpCommentTextTagger(buffer)
    {
        SupportsOldStyleXmlDocComments = ClassifierFactory.SupportsOldStyleXmlDocComments(bufferFilename),
        IgnoreXmlDocComments = spellingConfig.CSharpOptions.IgnoreXmlDocComments,
        IgnoreDelimitedComments = spellingConfig.CSharpOptions.IgnoreDelimitedComments,
        IgnoreStandardSingleLineComments = spellingConfig.CSharpOptions.IgnoreStandardSingleLineComments,
        IgnoreQuadrupleSlashComments = spellingConfig.CSharpOptions.IgnoreQuadrupleSlashComments,
        IgnoreNormalStrings = spellingConfig.CSharpOptions.IgnoreNormalStrings,
        IgnoreVerbatimStrings = spellingConfig.CSharpOptions.IgnoreVerbatimStrings,
        IgnoreInterpolatedStrings = spellingConfig.CSharpOptions.IgnoreInterpolatedStrings,
        IgnoredXmlElements = spellingConfig.IgnoredXmlElements,
        SpellCheckedAttributes = spellingConfig.SpellCheckedXmlAttributes
    };

    return csharpTagger as ITagger<T>;
}
/// <summary>
/// Trains a naive Bayes classifier on the book example data and checks the genre assigned to
/// three sample titles.
/// </summary>
public void NaiveBayesClassifierSimpleTest()
{
    var records = KeyValuePairFromTsv(bookExample);
    var classifier = ClassifierFactory.GetClassifierByName<Genre>("NaiveBayesClassifier");
    classifier.Train(records);

    // Expected value first, actual second, so a failure message reports them the right way round.
    Assert.AreEqual(Genre.INTERIORDECORATING, classifier.Categorize("Curtains and Drapes").Category);
    Assert.AreEqual(Genre.ENGINEERING, classifier.Categorize("The Ventilation of Bridges").Category);
    Assert.AreEqual(Genre.ACCOUNTING, classifier.Categorize("Tax Accounting").Category);
}
/// <summary>
/// Reads the cooking.stackexchange corpus, tokenizes and shuffles it, trains a classifier on
/// 70% of the sentences and requires better than 50% label accuracy on the remainder.
/// </summary>
public void CookingTest()
{
    // Every path used by this test lives under the same corpus directory.
    string corpusDir = Path.Combine(Configuration.GetValue<String>("MachineLearning:dataDir"),
        "Text Classification", "cooking.stackexchange");

    var reader = new FasttextDataReader();
    var readerOptions = new ReaderOptions
    {
        DataDir = corpusDir,
        FileName = "cooking.stackexchange.txt"
    };
    var sentences = reader.Read(readerOptions);

    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    // Tokenizing yields new sentence objects, so the labels are copied across by position.
    var tokenized = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList());
    for (int index = 0; index < tokenized.Count; index++)
    {
        tokenized[index].Label = sentences[index].Label;
    }

    sentences = tokenized.ToList();
    sentences.Shuffle();

    var options = new ClassifyOptions
    {
        ModelFilePath = Path.Combine(corpusDir, "nb.model"),
        TrainingCorpusDir = corpusDir,
        Dimension = 100
    };
    var classifier = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);

    var dataset = sentences.Split(0.7M);
    classifier.Train(dataset.Item1);

    int correct = 0;
    int total = 0;
    foreach (var sample in dataset.Item2)
    {
        var predictions = classifier.Classify(sample);
        if (sample.Label == predictions[0].Item1)
        {
            correct++;
        }

        total++;
    }

    var accuracy = (float)correct / total;
    Assert.IsTrue(accuracy > 0.5);
}
/// <summary>
/// Reads "train.csv", tokenizes and shuffles it, trains a naive Bayes classifier on 70% of the
/// first 2000 shuffled sentences and requires better than 50% label accuracy on the remainder.
/// </summary>
public void SpookyAuthorIdentification()
{
    var reader = new KaggleTextDataReader();
    var sentences = reader.Read(new ReaderOptions { FileName = "train.csv" });

    var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizer.GetTokenizer<TreebankTokenizer>();

    // Tokenizing yields new sentence objects, so ids and labels are copied across by position.
    var tokenized = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList());
    for (int index = 0; index < tokenized.Count; index++)
    {
        var source = sentences[index];
        tokenized[index].Id = source.Id;
        tokenized[index].Label = source.Label;
    }

    sentences = tokenized.ToList();
    sentences.Shuffle();

    var sample = sentences.Take(2000).ToList();
    var dataset = sample.Split(0.7M);

    var options = new ClassifyOptions
    {
        ModelDir = AppContext.BaseDirectory,
        ModelFilePath = Path.Combine(AppContext.BaseDirectory, "nb.model"),
        Dimension = 300
    };
    var classifier = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);
    classifier.GetClassifer("NaiveBayesClassifier");
    classifier.Train(dataset.Item1);

    int correct = 0;
    int total = 0;
    foreach (var candidate in dataset.Item2)
    {
        var predictions = classifier.Classify(candidate);
        if (candidate.Label == predictions[0].Item1)
        {
            correct++;
        }

        total++;
    }

    var accuracy = (float)correct / total;
    Assert.IsTrue(accuracy > 0.5);
}
//=====================================================================

/// <summary>
/// Constructor
/// </summary>
/// <param name="buffer">The text buffer</param>
/// <param name="view">The text view</param>
/// <param name="naturalTextAggregator">The tag aggregator</param>
/// <param name="urlAggregator">The URL aggregator</param>
/// <param name="configuration">The spell checker configuration to use</param>
/// <param name="dictionary">The spelling dictionary to use</param>
public SpellingTagger(ITextBuffer buffer, ITextView view, ITagAggregator<INaturalTextTag> naturalTextAggregator,
  ITagAggregator<IUrlTag> urlAggregator, SpellCheckerConfiguration configuration, SpellingDictionary dictionary)
{
    // Collaborators and state supplied by the caller
    _isClosed = false;
    _buffer = buffer;
    _naturalTextAggregator = naturalTextAggregator;
    _urlAggregator = urlAggregator;
    _dispatcher = Dispatcher.CurrentDispatcher;
    this.configuration = configuration;
    _dictionary = dictionary;

    // Tracking lists all start out empty
    _dirtySpans = new List<SnapshotSpan>();
    _misspellings = new List<MisspellingTag>();
    wordsIgnoredOnce = new List<IgnoredOnceWord>();
    inlineIgnoredWords = new List<InlineIgnoredWord>();

    // The word splitter's mnemonic and C-style flag are derived from the buffer's filename
    string sourceFile = buffer.GetFilename();

    var splitter = new WordSplitter
    {
        Configuration = configuration,
        Mnemonic = ClassifierFactory.GetMnemonic(sourceFile),
        IsCStyleCode = ClassifierFactory.IsCStyleCode(sourceFile)
    };

    wordSplitter = splitter;

    _buffer.Changed += BufferChanged;
    _naturalTextAggregator.TagsChanged += AggregatorTagsChanged;
    _urlAggregator.TagsChanged += AggregatorTagsChanged;
    _dictionary.DictionaryUpdated += DictionaryUpdated;
    _dictionary.ReplaceAll += ReplaceAll;
    _dictionary.IgnoreOnce += IgnoreOnce;
    view.Closed += ViewClosed;

    // Strings in SQL script can contain escaped single quotes which are apostrophes.  Unescape them
    // so that they are spell checked correctly.
    unescapeApostrophes = buffer.ContentType.IsOfType("SQL Server Tools");

    // To start with, the entire buffer is dirty.  Split this into chunks so we update pieces at a time.
    foreach (var snapshotLine in _buffer.CurrentSnapshot.Lines)
    {
        AddDirtySpan(snapshotLine.Extent);
    }
}
/// <summary>
/// Trains a word-feature classifier on the labeled "Gender" corpus, classifies a sample name,
/// then retrains on everything after the first 2000 shuffled entries and measures label
/// accuracy on those first 2000.
/// </summary>
public void GenderTest()
{
    string genderDir = Path.Combine(Configuration.GetValue<String>("MachineLearning:dataDir"), "Gender");
    var options = new ClassifyOptions { TrainingCorpusDir = genderDir };
    var classifier = new ClassifierFactory<WordFeatureExtractor>(options, SupportedLanguage.English);

    var corpus = GetLabeledCorpus(options);

    var tokenizerOptions = new TokenizationOptions { Pattern = RegexTokenizer.WORD_PUNC };
    var tokenizer = new TokenizerFactory(tokenizerOptions, SupportedLanguage.English);
    tokenizer.GetTokenizer<RegexTokenizer>();

    foreach (var entry in corpus)
    {
        entry.Words = tokenizer.Tokenize(entry.Text);
    }

    // First pass: train on the whole corpus and classify one name; the result is discarded.
    classifier.Train(corpus);

    string text = "Bridget";
    var probe = new Sentence { Text = text, Words = tokenizer.Tokenize(text) };
    classifier.Classify(probe);

    // Second pass: hold out the first 2000 shuffled entries for testing.
    corpus.Shuffle();

    var trainingData = corpus.Skip(2000).ToList();
    classifier.Train(trainingData);

    var testData = corpus.Take(2000).ToList();
    int correct = 0;
    foreach (var candidate in testData)
    {
        var predictions = classifier.Classify(candidate);
        if (candidate.Label == predictions[0].Item1)
        {
            correct++;
        }
    }

    // NOTE(review): the accuracy is computed but never asserted, so this test can only fail by
    // throwing. Confirm the intended threshold before adding an assertion.
    var accuracy = (float)correct / testData.Count;
}
/// <summary>
/// Trains naive Bayes classifiers on the "Category" and "Owner" columns of the full charge
/// list and checks the prediction for a few known descriptions.
/// </summary>
public void NaiveBayesClassificationIntegrationTest()
{
    // Release the resource stream once the record set has been read and used.
    using (Stream oldRecordsStream = TestUtils.RetrieveResource(fullChargeList))
    {
        CCRecordSet records = CCRecordSet.FromStream(oldRecordsStream, config);

        // Category column
        var nbc = ClassifierFactory.GetClassifierByName<string>("NaiveBayesClassifier");
        var trainingData = records
            .Select(rec => new KeyValuePair<string, string>(rec.Description, rec.PredictedValues["Category"]))
            .ToList();
        nbc.Train(trainingData);

        // Expected value first, actual second, so a failure message reports them the right way round.
        Assert.AreEqual("GROC", nbc.Categorize("Trader Joe's").Category);
        Assert.AreEqual("TRANS", nbc.Categorize("Shell Oil 27440482209 Seattle Wa").Category);

        // Owner column, using a fresh classifier instance
        trainingData = records
            .Select(rec => new KeyValuePair<string, string>(rec.Description, rec.PredictedValues["Owner"]))
            .ToList();
        nbc = ClassifierFactory.GetClassifierByName<string>("NaiveBayesClassifier");
        nbc.Train(trainingData);

        Assert.AreEqual("Bob", nbc.Categorize("Radio Shack 00133652 Knoxville").Category);
    }
}
/// <summary>
/// Creates the classifier factory held in <c>_classifier</c> the first time it is needed.
/// </summary>
/// <remarks>
/// Nothing happens when <c>_classifier</c> is already set. Otherwise <paramref name="meta"/> is
/// pointed at "intent.model", the options are filled from <c>Settings.ModelDir</c>, and the
/// classifier named by the "classifer" configuration value is requested from the new factory.
/// </remarks>
/// <param name="meta">Pipe metadata; its <c>Model</c> is overwritten on first initialization.</param>
private void Init(PipeModel meta)
{
    if (_classifier != null)
    {
        return;
    }

    meta.Model = "intent.model";

    var options = new ClassifyOptions();
    options.ModelFilePath = Path.Combine(Settings.ModelDir, meta.Model);
    options.ModelDir = Settings.ModelDir;
    options.ModelName = meta.Model;

    _classifier = new ClassifierFactory<SentenceFeatureExtractor>(options, SupportedLanguage.English);

    // The configuration key really is spelled "classifer"; it must match the configuration source.
    string configuredClassifier = Configuration.GetValue<String>("classifer");
    _classifier.GetClassifer(configuredClassifier);
}
/// <summary>
/// Trains one classifier per configured predicted column on the existing records, then appends
/// the new records to the existing set using those classifiers.
/// </summary>
/// <param name="oldRecordSet">Existing records used as training data; also the set that is appended to and returned.</param>
/// <param name="newRecords">Records to append.</param>
/// <param name="reader">Not used by this method.</param>
/// <param name="config">Configuration whose <c>columns</c> list the predicted columns and their classifier names.</param>
/// <returns>The same <paramref name="oldRecordSet"/> instance that was passed in.</returns>
internal static CCRecordSet ClassifyAndUpdate(CCRecordSet oldRecordSet, List<CCRecord> newRecords, IRecordReader reader, C3Configuration config)
{
    var trainedByColumn = new Dictionary<C3PredictedColumn, IClassifier<string>>();

    foreach (C3PredictedColumn column in config.columns)
    {
        // Each sample pairs a record's description with its value in this column.
        var samples = oldRecordSet
            .Select(rec => new KeyValuePair<string, string>(rec.Description, rec.PredictedValues[column.columnName]))
            .ToList();

        var columnClassifier = ClassifierFactory.GetClassifierByName<string>(column.classifierName);

        Utils.Log(LoggingSeverity.DEBUG,
            $"Training {column.classifierName} on column '{column.columnName}' with {samples.Count} records");

        columnClassifier.Train(samples);
        trainedByColumn.Add(column, columnClassifier);
    }

    AppendRecords(oldRecordSet, newRecords, trainedByColumn);

    return oldRecordSet;
}
/// <summary>
/// Decides whether the command is enabled, hidden or disabled for the given context.
/// </summary>
/// <param name="context">The command context the item is extracted from.</param>
/// <returns>
/// <c>Disabled</c> when <c>Settings.MissingKeys()</c> is true; <c>Hidden</c> when no item is
/// found, the item's template is not the classifier template, or the created classifier is not
/// an <c>IClassTrainer</c>; otherwise <c>Enabled</c>.
/// </returns>
public override CommandState QueryState(CommandContext context)
{
    if (Settings.MissingKeys())
    {
        return CommandState.Disabled;
    }

    Item ctxItem = DataWrapper?.ExtractItem(context);

    bool isClassifierItem = ctxItem != null
        && ctxItem.TemplateID.Guid == Settings.ClassifierTemplateId.Guid;
    if (!isClassifierItem)
    {
        return CommandState.Hidden;
    }

    // Only classifiers that can be trained expose this command.
    var classifier = ClassifierFactory.Create(ctxItem);
    if (classifier is IClassTrainer)
    {
        return CommandState.Enabled;
    }

    return CommandState.Hidden;
}
/// <summary>
/// Recursively narrows the threshold by halving the search interval, each time moving toward
/// the side whose error delta is smaller, until the interval drops below the minimum.
/// </summary>
/// <typeparam name="T">The sample type.</typeparam>
/// <param name="sampleSet">The sample set passed through to the error rate calculation.</param>
/// <param name="classifier">The classifier factory passed through to the error rate calculation.</param>
/// <param name="trainingSize">The training size passed through to the error rate calculation.</param>
/// <param name="minInterval">Recursion stops once <paramref name="interval"/> is below this value.</param>
/// <param name="interval">The current search interval; halved on every recursion.</param>
/// <param name="result">The current result, whose threshold is the centre of the search.</param>
/// <param name="xValidationStart">First cross-validation index.</param>
/// <param name="xValidationLength">Number of cross-validation indices.</param>
/// <returns>The result reached when the interval falls below <paramref name="minInterval"/>.</returns>
private static BiometricResult Evaluate<T>(SampleSet<T> sampleSet, ClassifierFactory<T> classifier, int trainingSize, double minInterval, double interval, BiometricResult result, int xValidationStart, int xValidationLength) where T : ISample
{
    // The threshold found for the smallest interval represents the best guess at the EER.
    if (interval < minInterval)
    {
        return result;
    }

    double halfStep = interval / 2.0;
    double lowerThreshold = result.GetThreshold() - halfStep;
    double upperThreshold = result.GetThreshold() + halfStep;

    // Both candidates are always evaluated before either is chosen.
    var lower = CalculateErrorRate(sampleSet, classifier, trainingSize, lowerThreshold, xValidationStart, xValidationLength);
    var upper = CalculateErrorRate(sampleSet, classifier, trainingSize, upperThreshold, xValidationStart, xValidationLength);

    BiometricResult next;
    if (lower.Item1.GetErrorDelta() < upper.Item1.GetErrorDelta())
    {
        next = new BiometricResult(lowerThreshold, lower.Item2);
    }
    else
    {
        next = new BiometricResult(upperThreshold, upper.Item2);
    }

    // Recurse until the smallest allowable interval is found.
    return Evaluate(sampleSet, classifier, trainingSize, minInterval, halfStep, next, xValidationStart, xValidationLength);
}
/// <summary>
/// Trains a naive Bayes classifier on the sentences of <paramref name="doc"/>, using each
/// sentence's intent label as its class.
/// </summary>
/// <param name="agent">The agent; not used by this method.</param>
/// <param name="doc">The document supplying the labeled sentences.</param>
/// <param name="meta">Pipe metadata; its <c>Model</c> is set to "classification-nb.model".</param>
/// <returns>Always <c>true</c>.</returns>
public async Task<bool> Train(Agent agent, NlpDoc doc, PipeModel meta)
{
    meta.Model = "classification-nb.model";
    string modelFileName = Path.Combine(Settings.ModelDir, meta.Model);

    var options = new ClassifyOptions();
    options.ModelFilePath = modelFileName;

    var classifier = new ClassifierFactory<NaiveBayesClassifier, SentenceFeatureExtractor>(options, SupportedLanguage.English);

    var trainingSentences = doc.Sentences
        .Select(source => new Sentence
        {
            Label = source.Intent.Label,
            Text = source.Text,
            Words = source.Tokens
        })
        .ToList();

    classifier.Train(trainingSentences);

    Console.WriteLine($"Saved model to {modelFileName}");

    return true;
}
//=====================================================================

/// <summary>
/// Update the available file extensions based on the file type
/// </summary>
/// <remarks>The list always starts with "All", which is also the entry selected afterwards.</remarks>
/// <param name="sender">The sender of the event</param>
/// <param name="e">The event arguments</param>
private void cboFileType_SelectionChanged(object sender, SelectionChangedEventArgs e)
{
    var selectedFileType = (string)cboFileType.SelectedItem;
    var knownExtensions = ClassifierFactory.ExtensionsFor(selectedFileType);

    cboExtension.ItemsSource = new[] { "All" }.Concat(knownExtensions);
    cboExtension.SelectedIndex = 0;
}
/// <summary>
/// Evaluates the factory's instance at index 0 against the given testing samples.
/// </summary>
/// <param name="classifier">The factory the instance is taken from.</param>
/// <param name="testingSamples">The samples to test with.</param>
/// <returns>The result of evaluating that instance.</returns>
public static BiometricResult Evaluate(ClassifierFactory<ISample> classifier, List<ISample> testingSamples)
{
    var firstInstance = classifier.GetInstance(0);

    return Evaluate(firstInstance, testingSamples);
}
/// <summary>
/// Computes the error rate of each cross-validation instance at the given threshold, in
/// parallel, and sums them into a total.
/// </summary>
/// <typeparam name="T">The sample type.</typeparam>
/// <param name="sampleSet">The sample set the testing samples are taken from.</param>
/// <param name="classifier">The factory supplying one classifier instance per index.</param>
/// <param name="trainingSize">The training size used when selecting testing samples.</param>
/// <param name="threshold">The threshold to test at.</param>
/// <param name="xValidationStart">First cross-validation index.</param>
/// <param name="xValidationLength">Number of cross-validation indices.</param>
/// <returns>The summed error rate paired with the list of per-index error rates.</returns>
private static Tuple<ErrorRatePair, List<ErrorRatePair>> CalculateErrorRate<T>(SampleSet<T> sampleSet, ClassifierFactory<T> classifier, int trainingSize, double threshold, int xValidationStart, int xValidationLength) where T : ISample
{
    // Test classifier instances with testing samples and threshold in parallel to get the
    // cross validated result.
    // NOTE(review): AsParallel() is used without AsOrdered(), so the order of the per-index
    // list is not guaranteed to follow the index order - confirm callers do not rely on it.
    List<ErrorRatePair> foldErrors = Enumerable.Range(xValidationStart, xValidationLength)
        .AsParallel()
        .Select(fold => CalculateErrorRate(
            classifier.GetInstance(fold),
            SampleSetHelpers.GetSampleSetTestingSamples(sampleSet, trainingSize, fold),
            threshold))
        .ToList();

    // Seed the sum with the first entry, then add the rest.
    ErrorRatePair combined = foldErrors
        .Skip(1)
        .Aggregate(foldErrors[0], (sum, next) => sum + next);

    return Tuple.Create(combined, foldErrors);
}