/// <summary>
/// Entry point of the entity-recognition sample. Registers the built-in English models,
/// loads a sentence detector and two part-of-speech taggers, and then runs the
/// perceptron / pattern-spotter demonstration followed by the spotter demonstration.
/// </summary>
private static async Task Main()
{
    // Initialize the English built-in models.
    // (Storage.Current is intentionally left untouched here.)
    Catalyst.Models.English.Register();

    Console.OutputEncoding = Encoding.UTF8;
    ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(logging => logging.AddConsole()));

    // Catalyst currently supports 3 different types of models for Named Entity Recognition (NER):
    //  - Gazetteer-like (i.e. Spotter):
    //    https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/Spotter.cs
    //  - Regex-like (i.e. PatternSpotter):
    //    https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/PatternSpotter.cs
    //  - Perceptron (i.e. AveragePerceptronEntityRecognizer):
    //    https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/AveragePerceptronEntityRecognizer.cs

    // The results below are never read afterwards; presumably these calls only
    // check that the registered models can be loaded - TODO confirm.
    var sentenceDetector = await SentenceDetector.FromStoreAsync(Language.English, -1, "");

    var manuallyLoadedTagger = new AveragePerceptronTagger(Language.English, 0, "");
    await manuallyLoadedTagger.LoadDataAsync();

    var storedTagger = await AveragePerceptronTagger.FromStoreAsync(Language.English, -1, "");

    await DemonstrateAveragePerceptronEntityRecognizerAndPatternSpotter();
    DemonstrateSpotter();
}
/// <summary>
/// Entry point of the Reuters classification sample: fetches the corpus, trains a
/// FastText supervised multi-label classifier, auto-tunes it, and prints F1 / precision /
/// recall for the train and test sets side by side.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static async Task Main(string[] args)
{
    Console.OutputEncoding = Encoding.UTF8;
    ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(logging => logging.AddConsole()));

    // Use the online model repository, backed by the local folder ./catalyst-models/
    Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));

    // Download the Reuters corpus if necessary
    var (trainingSet, testSet) = await Corpus.Reuters.GetAsync();

    // The corpus text is untokenized so far, so run it through the English pipeline first
    var pipeline = Pipeline.For(Language.English);
    var trainDocs = pipeline.Process(trainingSet).ToArray();
    var testDocs = pipeline.Process(testSet).ToArray();

    // Supervised FastText classifier with a multi-label loss (OneVsAll)
    var classifier = new FastText(Language.English, 0, "Reuters-Classifier");
    classifier.Data.Type = FastText.ModelType.Supervised;
    classifier.Data.Loss = FastText.LossType.OneVsAll;
    classifier.Data.LearningRate = 1f;
    classifier.Data.Dimensions = 256;
    classifier.Data.Epoch = 100;
    classifier.Data.MinimumWordNgramsCounts = 5;
    classifier.Data.MaximumWordNgrams = 3;
    classifier.Data.MinimumCount = 5;

    classifier.Train(trainDocs);

    // The model can also be auto-tuned, using the algorithm described at
    // https://ai.facebook.com/blog/fasttext-blog-post-open-source-in-brief/
    classifier.AutoTuneTrain(trainDocs, testDocs, new FastText.AutoTuneOptions());

    // Compute predictions for every document of both sets
    Dictionary<IDocument, Dictionary<string, float>> trainPredictions, testPredictions;

    using (new Measure(Logger, "Computing train-set predictions", trainDocs.Length))
    {
        trainPredictions = trainDocs
            .AsParallel()
            .Select(document => (Document: document, Scores: classifier.Predict(document)))
            .ToDictionary(entry => entry.Document, entry => entry.Scores);
    }

    using (new Measure(Logger, "Computing test set predictions", testDocs.Length))
    {
        testPredictions = testDocs
            .AsParallel()
            .Select(document => (Document: document, Scores: classifier.Predict(document)))
            .ToDictionary(entry => entry.Document, entry => entry.Scores);
    }

    var trainStats = ComputeStats(trainPredictions);
    var testStats = ComputeStats(testPredictions);

    Console.WriteLine("\n\n\n--- Results ---\n\n\n");

    // One output line per pair of train/test results
    foreach (var (onTrain, onTest) in trainStats.Zip(testStats))
    {
        Console.WriteLine($"\tScore cutoff: {onTrain.Cutoff:n2} Train: F1={onTrain.F1:n2} P={onTrain.Precision:n2} R={onTrain.Recall:n2} Test: F1={onTest.F1:n2} P={onTest.Precision:n2} R={onTest.Recall:n2}");
    }

    // Keep the console open until the user presses Enter
    Console.ReadLine();
}
/// <summary>
/// Entry point of the model-training tool. Parses the command line into
/// <c>CommandLineOptions</c> and runs every training step whose input path was supplied.
/// </summary>
/// <param name="args">Command-line arguments, parsed into <c>CommandLineOptions</c>.</param>
static async Task Main(string[] args)
{
    ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(logging => logging.AddConsole()));
    ForceInvariantCultureAndUTF8Output();

    await Parser.Default
        .ParseArguments<CommandLineOptions>(args)
        .MapResult(
            async options =>
            {
                var hasToken = !string.IsNullOrWhiteSpace(options.Token);
                if (hasToken)
                {
                    // For uploading on the online models repository
                    Storage.Current = new OnlineWriteableRepositoryStorage(new DiskStorage(options.DiskStoragePath), options.Token);
                }
                else
                {
                    Storage.Current = new DiskStorage(options.DiskStoragePath);
                }

                // Raise the priority of this thread and of the whole process
                Thread.CurrentThread.Priority = ThreadPriority.AboveNormal;
                using (var currentProcess = Process.GetCurrentProcess())
                {
                    currentProcess.PriorityClass = ProcessPriorityClass.High;
                }

                if (!string.IsNullOrWhiteSpace(options.UniversalDependenciesPath))
                {
                    TrainSentenceDetector.Train(options.UniversalDependenciesPath);
                    TrainPOSTagger.Train(udSource: options.UniversalDependenciesPath, ontonotesSource: options.OntonotesPath);
                }

                if (!string.IsNullOrWhiteSpace(options.WikiNERPath))
                {
                    // Trained one language after the other, in this exact order
                    var wikiNerLanguages = new[]
                    {
                        Language.English,
                        Language.French,
                        Language.German,
                        Language.Spanish,
                        Language.Italian,
                        Language.Portuguese,
                        Language.Russian,
                        Language.Dutch,
                        Language.Polish,
                    };

                    foreach (var language in wikiNerLanguages)
                    {
                        await TrainWikiNER.TrainAsync(options.WikiNERPath, language, 0, "WikiNER");
                    }
                }

                if (!string.IsNullOrWhiteSpace(options.FastTextLanguageSentencesPath))
                {
                    TrainLanguageDetector.Train(options.FastTextLanguageSentencesPath);
                    TrainLanguageDetector.Test(options.FastTextLanguageSentencesPath);
                }

                if (!string.IsNullOrWhiteSpace(options.LanguageJsonPath))
                {
                    TrainLanguageDetector.CreateLanguageDetector(options.LanguageJsonPath);
                }
            },
            // Nothing to do when the arguments could not be parsed
            error => Task.CompletedTask);
}
/// <summary>
/// Entry point: enables console logging and runs <c>NNTest.Test()</c>.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The DTTest run is currently disabled:
    //   var dTTest = new DTTest();
    //   dTTest.Test();

    ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(logging => logging.AddConsole()));

    var test = new NNTest();
    test.Test();
}
/// <summary>
/// Entry point of the language-detection sample. Runs the FastText-based and the
/// CLD2-based detectors over the short and the long samples, printing the expected and
/// the detected language for each, then prints the top 10 FastText predictions for the
/// long Spanish sample.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static async Task Main(string[] args)
{
    Console.OutputEncoding = Encoding.UTF8;
    ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(logging => logging.AddConsole()));

    // The languages we want to use have to be registered first
    Catalyst.Models.English.Register();

    // Two language detection models are available in Catalyst:
    //  - one derived from Chrome's former language detection code, Compact Language Detector 2
    //    (https://github.com/CLD2Owners/cld2)
    //  - a newer one derived from Facebook's FastText language detection dataset
    //    (see: https://fasttext.cc/blog/2017/10/02/blog-post.html)

    // Use the local folder ./catalyst-models/ as model storage
    Storage.Current = new DiskStorage("catalyst-models");

    var cld2Detector = await LanguageDetector.FromStoreAsync(Language.Any, Version.Latest, "");
    var fastTextDetector = await FastTextLanguageDetector.FromStoreAsync(Language.Any, Version.Latest, "");

    // Detection is shown below on short and on longer samples. Expect lower precision on
    // shorter texts, as there is less information for the model to work with. It is also
    // interesting to see the kind of mistakes these models make, such as detecting Welsh
    // as Gaelic_Scottish_Gaelic.
    foreach (var (expected, sample) in Data.ShortSamples)
    {
        var fastTextDoc = new Document(sample);
        fastTextDetector.Process(fastTextDoc);

        var cld2Doc = new Document(sample);
        cld2Detector.Process(cld2Doc);

        Console.WriteLine(sample);
        Console.WriteLine($"Actual:\t{expected}\nFT:\t{fastTextDoc.Language}\nCLD2\t{cld2Doc.Language}");
        Console.WriteLine();
    }

    foreach (var (expected, sample) in Data.LongSamples)
    {
        var fastTextDoc = new Document(sample);
        fastTextDetector.Process(fastTextDoc);

        var cld2Doc = new Document(sample);
        cld2Detector.Process(cld2Doc);

        Console.WriteLine(sample);
        Console.WriteLine($"Actual:\t{expected}\nFT:\t{fastTextDoc.Language}\nCLD2\t{cld2Doc.Language}");
        Console.WriteLine();
    }

    // All predictions are also available through the Predict method
    var spanishDocument = new Document(Data.LongSamples[Language.Spanish]);
    var spanishPredictions = fastTextDetector.Predict(spanishDocument);

    Console.WriteLine($"\n\nTop 10 predictions and scores for the Spanish sample:");

    var topTen = spanishPredictions.OrderByDescending(prediction => prediction.Value).Take(10);
    foreach (var prediction in topTen)
    {
        Console.WriteLine($"{prediction.Key.ToString().PadRight(40)}\tScore: {prediction.Value:n2}");
    }
}
/// <summary>
/// Entry point of the WikiNER sample. Builds an English pipeline with the WikiNER entity
/// recognizer, prints the entities of the sample documents, then adds a Neuralyzer that
/// re-labels "Amazon" from Location to Organization and prints the corrected result.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static async Task Main(string[] args)
{
    Console.OutputEncoding = Encoding.UTF8;
    ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(logging => logging.AddConsole()));

    // This example uses the WikiNER model, trained on the data provided by the paper
    // "Learning multilingual named entity recognition from Wikipedia",
    // Artificial Intelligence 194 (DOI: 10.1016/j.artint.2012.03.006).
    // Training data source: https://github.com/dice-group/FOX/tree/master/input/Wikiner

    // Use the online model repository, backed by the local folder ./catalyst-models/
    Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));

    // English pipeline + WikiNER model
    Console.WriteLine("Loading models... This might take a bit longer the first time you run this sample, as the models have to be downloaded from the online repository");
    var pipeline = await Pipeline.ForAsync(Language.English);
    var wikiNer = await AveragePerceptronEntityRecognizer.FromStoreAsync(language: Language.English, version: Version.Latest, tag: "WikiNER");
    pipeline.Add(wikiNer);

    // A single document is processed with ProcessSingle
    var singleDocument = new Document(Data.Sample_1, Language.English);
    pipeline.ProcessSingle(singleDocument);

    // Multiple documents can be processed in parallel (i.e. multithreading) by passing
    // an IEnumerable<IDocument> to Process
    var processedDocuments = pipeline.Process(MultipleDocuments());

    // Print all recognized entities. Note how the WikiNER model mistakenly recognizes
    // Amazon as a location in Data.Sample_1
    PrintDocumentEntities(singleDocument);
    foreach (var processed in processedDocuments)
    {
        PrintDocumentEntities(processed);
    }

    // Entity recognition mistakes can be corrected with the Neuralyzer class. It uses the
    // pattern matching entity recognition class to perform "forget-entity" and
    // "add-entity" passes on the document, after it has been processed by all other
    // processes in the NLP pipeline.
    var fixes = new Neuralyzer(Language.English, 0, "WikiNER-sample-fixes");

    // Forget the match for the single token "Amazon" with entity type "Location"
    fixes.TeachForgetPattern(
        "Location",
        "Amazon",
        mp => mp.Add(new PatternUnit(P.Single().WithToken("Amazon").WithEntityType("Location"))));

    // Add the entity type Organization for a match on the single token "Amazon"
    fixes.TeachAddPattern(
        "Organization",
        "Amazon",
        mp => mp.Add(new PatternUnit(P.Single().WithToken("Amazon"))));

    // Plug the Neuralyzer into the pipeline
    pipeline.UseNeuralyzer(fixes);

    // "Amazon" is now recognized as the entity type "Organization"
    var correctedDocument = new Document(Data.Sample_1, Language.English);
    pipeline.ProcessSingle(correctedDocument);
    PrintDocumentEntities(correctedDocument);
}
/// <summary>
/// Entry point of the entity-recognition sample: sets up UTF-8 console output and console
/// logging, then runs the perceptron / pattern-spotter demonstration followed by the
/// spotter demonstration.
/// </summary>
private static async Task Main()
{
    Console.OutputEncoding = Encoding.UTF8;
    ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(logging => logging.AddConsole()));

    // Catalyst currently supports 3 different types of models for Named Entity Recognition (NER):
    //  - Gazetteer-like (i.e. Spotter):
    //    https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/Spotter.cs
    //  - Regex-like (i.e. PatternSpotter):
    //    https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/PatternSpotter.cs
    //  - Perceptron (i.e. AveragePerceptronEntityRecognizer):
    //    https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/AveragePerceptronEntityRecognizer.cs

    await DemonstrateAveragePerceptronEntityRecognizerAndPatternSpotter();

    DemonstrateSpotter();
}
/// <summary>
/// Entry point of the LDA topic-modelling sample: trains a 20-topic LDA model on the
/// Reuters training set, stores it, loads it back from the store, and prints the
/// predicted topics for every test document that yields a prediction.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static async Task Main(string[] args)
{
    Console.OutputEncoding = Encoding.UTF8;
    ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(logging => logging.AddConsole()));

    // The languages we want to use have to be registered first
    Catalyst.Models.English.Register();

    // Use the local folder ./catalyst-models/ as model storage
    Storage.Current = new DiskStorage("catalyst-models");

    // Download the Reuters corpus if necessary
    var (trainingSet, testSet) = await Corpus.Reuters.GetAsync();

    // The corpus text is untokenized so far, so run it through the English pipeline first
    var pipeline = Pipeline.For(Language.English);
    var trainDocs = pipeline.Process(trainingSet).ToArray();
    var testDocs = pipeline.Process(testSet).ToArray();

    // Train an LDA topic model on the training dataset and persist it
    using (var trainedModel = new LDA(Language.English, 0, "reuters-lda"))
    {
        trainedModel.Data.NumberOfTopics = 20; // Arbitrary number of topics
        trainedModel.Train(trainDocs, Environment.ProcessorCount);
        await trainedModel.StoreAsync();
    }

    // Load the stored model back and describe the topics of each test document
    using (var storedModel = await LDA.FromStoreAsync(Language.English, 0, "reuters-lda"))
    {
        foreach (var document in testDocs)
        {
            if (!storedModel.TryPredict(document, out var topics))
            {
                // Nothing to print for this document
                continue;
            }

            // One line per topic; a topic that cannot be described contributes an empty line
            var topicLines = topics.Select(topic =>
            {
                if (storedModel.TryDescribeTopic(topic.TopicID, out var description))
                {
                    return $"[{topic.Score:n3}] => {description.ToString()}";
                }

                return "";
            });
            var docTopics = string.Join("\n", topicLines);

            Console.WriteLine("------------------------------------------");
            Console.WriteLine(document.Value);
            Console.WriteLine("------------------------------------------");
            Console.WriteLine(docTopics);
            Console.WriteLine("------------------------------------------\n\n");
        }
    }
}
/// <summary>
/// Entry point of the entity-recognition sample: registers the built-in English models,
/// sets up UTF-8 console output and console logging, then runs the spotter sample followed
/// by the perceptron / pattern-spotter sample.
/// </summary>
private static async Task Main()
{
    // Initialize the English built-in models
    Catalyst.Models.English.Register();

    // Configuring Storage.Current is not necessary anymore, as the default English models
    // are consumed from Nuget. It used to be:
    //   Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));

    Console.OutputEncoding = Encoding.UTF8;
    ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(logging => logging.AddConsole()));

    // Catalyst currently supports 3 different types of models for Named Entity Recognition (NER):
    //  - Gazetteer-like (i.e. Spotter):
    //    https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/Spotter.cs
    //  - Regex-like (i.e. PatternSpotter):
    //    https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/PatternSpotter.cs
    //  - Perceptron (i.e. AveragePerceptronEntityRecognizer):
    //    https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/AveragePerceptronEntityRecognizer.cs

    SpotterSample();

    await AveragePerceptronEntityRecognizerAndPatternSpotterSample();
}
/// <summary>
/// Entry point of the compilation sample: registers the default MSBuild instance,
/// configures ZLogger console logging with a "[level] [HH:mm:ss] " prefix, submits a small
/// in-memory source file to <c>CompilationProcessor.CompileAsync</c>, and logs every
/// output file together with its code.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static async Task Main(string[] args)
{
    Microsoft.Build.Locator.MSBuildLocator.RegisterDefaults();

    var loggerFactory = LoggerFactory.Create(logging =>
        logging.SetMinimumLevel(LogLevel.Information)
               .AddZLoggerConsole(options =>
               {
                   // Prefix every message with its level and the local wall-clock time
                   options.PrefixFormatter = (buf, info) =>
                   {
                       var localTime = info.Timestamp.LocalDateTime;
                       ZString.Utf8Format(buf, "[{0}] [{1:D2}:{2:D2}:{3:D2}] ", GetLogLevelString(info.LogLevel), localTime.Hour, localTime.Minute, localTime.Second);
                   };
               }));
    ApplicationLogging.SetLoggerFactory(loggerFactory);

    var settings = new H5DotJson_AssemblySettings();

    // Single-file request, with the h5 packages pinned to fixed versions
    var request = new CompilationRequest("App", settings)
        .NoPackageResources()
        .NoHTML()
        .WithPackageReference("h5", NuGetVersion.Parse("0.0.8537"))
        .WithPackageReference("h5.Core", NuGetVersion.Parse("0.0.8533"))
        .WithSourceFile("App.cs", @" using System; using H5; namespace Test { internal static class App { private static int HelloWorld; private static void Main() { Console.WriteLine(nameof(HelloWorld)); } } } ");

    var compilation = await CompilationProcessor.CompileAsync(request);

    foreach (var (fileName, contents) in compilation.Output)
    {
        Logger.ZLogInformation("File: {0}\n\n----------------------------\n\n{1}\n\n----------------------------\n\n", fileName, contents);
    }

    // Wait a moment so that all log messages get printed before the process exits
    await Task.Delay(1000);
}
/// <summary>
/// Entry point of the model-training tool. Parses the command line into
/// <c>CommandLineOptions</c>, configures model storage and process priority, makes sure
/// the language projects exist, and then runs every preparation / training step whose
/// input path was supplied.
/// </summary>
/// <remarks>
/// Two debugging leftovers were removed from this method:
/// <list type="bullet">
/// <item><description>
/// The storage selection was guarded by <c>true || ...</c>, which made the online
/// repository branch unreachable and silently ignored <c>options.Token</c>.
/// </description></item>
/// <item><description>
/// An unconditional <c>return;</c> after the part-of-speech tagger step made the WikiNER
/// and language-detector steps unreachable even when their paths were supplied.
/// </description></item>
/// </list>
/// </remarks>
/// <param name="args">Command-line arguments, parsed into <c>CommandLineOptions</c>.</param>
static async Task Main(string[] args)
{
    ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(builder => builder.AddConsole()));
    ForceInvariantCultureAndUTF8Output();

    await Parser.Default
        .ParseArguments<CommandLineOptions>(args)
        .MapResult(
            async options =>
            {
                if (string.IsNullOrWhiteSpace(options.Token))
                {
                    Storage.Current = new DiskStorage(options.DiskStoragePath);
                }
                else
                {
                    // For uploading on the online models repository
                    Storage.Current = new OnlineWriteableRepositoryStorage(new DiskStorage(options.DiskStoragePath), options.Token);
                }

                // Raise thread / process priority and widen the thread pool limits
                Thread.CurrentThread.Priority = ThreadPriority.AboveNormal;
                ThreadPool.SetMinThreads(Environment.ProcessorCount * 2, Environment.ProcessorCount * 2);
                ThreadPool.SetMaxThreads(Environment.ProcessorCount * 20, Environment.ProcessorCount * 20);

                using (var p = Process.GetCurrentProcess())
                {
                    p.PriorityClass = ProcessPriorityClass.High;
                }

                await CreateProjectsIfNeeded(options.LanguagesDirectory);

                if (!string.IsNullOrWhiteSpace(options.HoldOffHungerData))
                {
                    await PrepareBritishToAmericanSpellings.RunAsync(options.HoldOffHungerData, options.LanguagesDirectory);
                }

                if (!string.IsNullOrWhiteSpace(options.SpacyLookupsData))
                {
                    await PrepareSpacyLookups.RunAsync(options.SpacyLookupsData, options.LanguagesDirectory);
                }

                if (!string.IsNullOrWhiteSpace(options.UniversalDependenciesPath))
                {
                    // NOTE(review): sentence-detector training was already disabled in the original code; left disabled here.
                    //await TrainSentenceDetector.Train(options.UniversalDependenciesPath, options.LanguagesDirectory);
                    await TrainPOSTagger.Train(udSource: options.UniversalDependenciesPath, ontonotesSource: options.OntonotesPath, languagesDirectory: options.LanguagesDirectory);
                }

                if (!string.IsNullOrWhiteSpace(options.WikiNERPath))
                {
                    // Trained one language after the other, in this exact order
                    var wikiNerLanguages = new[]
                    {
                        Language.English,
                        Language.French,
                        Language.German,
                        Language.Spanish,
                        Language.Italian,
                        Language.Portuguese,
                        Language.Russian,
                        Language.Dutch,
                        Language.Polish,
                    };

                    foreach (var language in wikiNerLanguages)
                    {
                        await TrainWikiNER.TrainAsync(options.WikiNERPath, language, 0, "WikiNER", options.LanguagesDirectory);
                    }
                }

                if (!string.IsNullOrWhiteSpace(options.FastTextLanguageSentencesPath))
                {
                    TrainLanguageDetector.Train(options.FastTextLanguageSentencesPath);
                    TrainLanguageDetector.Test(options.FastTextLanguageSentencesPath);
                }

                if (!string.IsNullOrWhiteSpace(options.LanguageJsonPath))
                {
                    TrainLanguageDetector.CreateLanguageDetector(options.LanguageJsonPath);
                }
            },
            // Nothing to do when the arguments could not be parsed
            error => Task.CompletedTask);
}
/// <summary>
/// Entry point of the entity-recognition sample. Demonstrates the three kinds of NER
/// models in turn: the pre-trained WikiNER perceptron model, a custom PatternSpotter, and
/// a gazetteer-like Spotter for programming languages. Also shows how a Neuralyzer
/// corrects a wrong entity type.
/// </summary>
/// <remarks>
/// The programming-language document is processed with the tokenizer pipeline that holds
/// the spotter. The previous code added the spotter to that pipeline but then processed
/// the document with the WikiNER pipeline, so no "ProgrammingLanguage" entity could ever
/// be produced by the spotter.
/// </remarks>
/// <param name="args">Command-line arguments (not used).</param>
public static async Task Main(string[] args)
{
    Console.OutputEncoding = Encoding.UTF8;
    ApplicationLogging.SetLoggerFactory(LoggerFactory.Create(lb => lb.AddConsole()));

    // Catalyst currently supports 3 different types of models for Named Entity Recognition (NER):
    //  - Gazetteer-like (i.e. Spotter):
    //    https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/Spotter.cs
    //  - Regex-like (i.e. PatternSpotter):
    //    https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/PatternSpotter.cs
    //  - Perceptron (i.e. AveragePerceptronEntityRecognizer):
    //    https://github.com/curiosity-ai/catalyst/blob/master/Catalyst/src/Models/EntityRecognition/AveragePerceptronEntityRecognizer.cs

    // For training an AveragePerceptronModel, check the source code here:
    // https://github.com/curiosity-ai/catalyst/blob/master/Catalyst.Training/src/TrainWikiNER.cs
    // This example uses the pre-trained WikiNER model, trained on the data provided by the paper
    // "Learning multilingual named entity recognition from Wikipedia",
    // Artificial Intelligence 194 (DOI: 10.1016/j.artint.2012.03.006).
    // Training data source: https://github.com/dice-group/FOX/tree/master/input/Wikiner

    // Use the online model repository, backed by the local folder ./catalyst-models/
    Storage.Current = new OnlineRepositoryStorage(new DiskStorage("catalyst-models"));

    // Create a new pipeline for the English language, and add the WikiNER model to it
    Console.WriteLine("Loading models... This might take a bit longer the first time you run this sample, as the models have to be downloaded from the online repository");
    var nlp = await Pipeline.ForAsync(Language.English);
    nlp.Add(await AveragePerceptronEntityRecognizer.FromStoreAsync(language: Language.English, version: Version.Latest, tag: "WikiNER"));

    // Another available model for NER is the PatternSpotter, which is the conceptual equivalent
    // of a RegEx on raw text, but operating on the tokenized form of the text.
    // This custom pattern is: single("is" / VERB) + multiple(NOUN/PROPN/AUX/DET/ADJ)
    var isApattern = new PatternSpotter(Language.English, 0, tag: "is-a-pattern", captureTag: "IsA");
    isApattern.NewPattern(
        "Is+Noun",
        mp => mp.Add(
            new PatternUnit(P.Single().WithToken("is").WithPOS(PartOfSpeech.VERB)),
            new PatternUnit(P.Multiple().WithPOS(PartOfSpeech.NOUN, PartOfSpeech.PROPN, PartOfSpeech.AUX, PartOfSpeech.DET, PartOfSpeech.ADJ))
        ));
    nlp.Add(isApattern);

    // For processing a single document, you can call nlp.ProcessSingle
    var doc = new Document(Data.Sample_1, Language.English);
    nlp.ProcessSingle(doc);

    // For processing multiple documents in parallel (i.e. multithreading), you can call
    // nlp.Process on an IEnumerable<IDocument>
    var docs = nlp.Process(MultipleDocuments());

    // This will print all recognized entities. You can also see how the WikiNER model makes
    // a mistake by recognizing Amazon as a location in Data.Sample_1
    PrintDocumentEntities(doc);
    foreach (var d in docs)
    {
        PrintDocumentEntities(d);
    }

    // For correcting entity recognition mistakes, you can use the Neuralyzer class.
    // It uses the pattern matching entity recognition class to perform "forget-entity" and
    // "add-entity" passes on the document, after it has been processed by all other
    // processes in the NLP pipeline.
    var neuralizer = new Neuralyzer(Language.English, 0, "WikiNER-sample-fixes");

    // Teach the Neuralyzer to forget the match for a single token "Amazon" with entity type "Location"
    neuralizer.TeachForgetPattern("Location", "Amazon", mp => mp.Add(new PatternUnit(P.Single().WithToken("Amazon").WithEntityType("Location"))));

    // Teach the Neuralyzer to add the entity type Organization for a match on the single token "Amazon"
    neuralizer.TeachAddPattern("Organization", "Amazon", mp => mp.Add(new PatternUnit(P.Single().WithToken("Amazon"))));

    // Add the Neuralyzer to the pipeline
    nlp.UseNeuralyzer(neuralizer);

    // Now you can see that "Amazon" is correctly recognized as the entity type "Organization"
    var doc2 = new Document(Data.Sample_1, Language.English);
    nlp.ProcessSingle(doc2);
    PrintDocumentEntities(doc2);

    // Another way to perform entity recognition is to use a gazetteer-like model.
    // For example, here is one for capturing a set of programming languages
    var spotter = new Spotter(Language.Any, 0, "programming", "ProgrammingLanguage");
    spotter.Data.IgnoreCase = true; // In some cases, it might be better to set it to false, and only add upper/lower-case exceptions as required

    spotter.AddEntry("C#");
    spotter.AddEntry("Python");
    spotter.AddEntry("Python 3"); // Entries can have more than one word, and will be automatically tokenized on whitespace
    spotter.AddEntry("C++");
    spotter.AddEntry("Rust");
    spotter.AddEntry("Java");

    var nlp2 = Pipeline.TokenizerFor(Language.English);
    nlp2.Add(spotter); // When adding a spotter model, the model propagates any exceptions on tokenization to the pipeline's tokenizer

    var docAboutProgramming = new Document(Data.SampleProgramming, Language.English);

    // Process with nlp2, the pipeline that actually contains the spotter
    nlp2.ProcessSingle(docAboutProgramming);

    PrintDocumentEntities(docAboutProgramming);
}