/// <summary>
/// Trains a parser model with the given parameters.
/// </summary>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to messages during training or to cancel the training operation.
/// This argument can be a <c>null</c> value.
/// </param>
/// <param name="languageCode">The language code.</param>
/// <param name="samples">The data samples.</param>
/// <param name="rules">The head rules.</param>
/// <param name="parameters">The machine learnable parameters.</param>
/// <returns>The trained <see cref="ParserModel" /> object.</returns>
public static ParserModel Train(Monitor monitor, string languageCode, IObjectStream<Parse> samples, AbstractHeadRules rules, TrainingParameters parameters) {
    var dict = BuildDictionary(samples, rules, parameters);

    samples.Reset();

    var manifestInfoEntries = new Dictionary<string, string>();

    // build
    //System.err.println("Training builder");
    var bes = new ParserEventStream(samples, rules, ParserEventTypeEnum.Build, dict);
    var buildReportMap = new Dictionary<string, string>();
    var buildTrainer = TrainerFactory.GetEventTrainer(parameters.GetNamespace("build"), buildReportMap, monitor);

    var buildModel = buildTrainer.Train(bes);

    MergeReportIntoManifest(manifestInfoEntries, buildReportMap, "build");

    samples.Reset();

    // tag
    var posTaggerParams = parameters.GetNamespace("tagger");
    if (!posTaggerParams.Contains(Parameters.BeamSize)) {
        posTaggerParams.Set(Parameters.BeamSize, "10");
    }

    var posModel = POSTaggerME.Train(languageCode, new PosSampleStream(samples), parameters.GetNamespace("tagger"), new POSTaggerFactory());

    samples.Reset();

    // chunk
    var chunkModel = ChunkerME.Train(languageCode, new ChunkSampleStream(samples), parameters.GetNamespace("chunker"), new ParserChunkerFactory());

    samples.Reset();

    // check
    //System.err.println("Training checker");
    var kes = new ParserEventStream(samples, rules, ParserEventTypeEnum.Check);
    var checkReportMap = new Dictionary<string, string>();
    var checkTrainer = TrainerFactory.GetEventTrainer(parameters.GetNamespace("check"), checkReportMap, monitor);

    var checkModel = checkTrainer.Train(kes);

    MergeReportIntoManifest(manifestInfoEntries, checkReportMap, "check");

    return new ParserModel(languageCode, buildModel, checkModel, posModel, chunkModel, rules, manifestInfoEntries);
}
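A minimal usage sketch for the Train overload above. The wrapper method, the training file name, and the ParseSampleStream/PlainTextByLineStream stream types are assumptions based on the OpenNLP/SharpNL conventions used in these snippets, not part of the original code.

// Hypothetical caller: reads Penn Treebank-style parse samples from a file and trains
// a chunking parser model. File name and stream types are illustrative assumptions.
public static ParserModel TrainFromFile(string trainFile, AbstractHeadRules headRules) {
    using (var file = new FileStream(trainFile, FileMode.Open, FileAccess.Read)) {
        var samples = new ParseSampleStream(new PlainTextByLineStream(file));

        var parameters = new TrainingParameters();
        parameters.Set(Parameters.Iterations, "100");
        parameters.Set(Parameters.Cutoff, "5");

        // The monitor may be null; pass one to receive progress messages or cancel training.
        return Train((Monitor)null, "en", samples, headRules, parameters);
    }
}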
public ILexer InitNow() {
    Console.WriteLine("Loading...");

    _tokenizer = prepareTokenizer();
    _posTagger = preparePOSTagger();

    return this;
}
public List<Tag> GetTags(string paragraph) {
    var bin = GetFileStream("en-pos-maxent.bin");
    POSModel model = new POSModel(bin);
    POSTagger tagger = new POSTaggerME(model);

    var sentenceSpans = SentPosDetect(paragraph);
    List<Tag> tagsResult = new List<Tag>();

    foreach (var sentenceSpan in sentenceSpans) {
        var sentence = sentenceSpan.getCoveredText(paragraph).toString();
        var start = sentenceSpan.getStart();
        var end = sentenceSpan.getEnd();

        var tokenSpans = GetTokens(sentence);
        var tokens = new string[tokenSpans.Length];

        for (var i = 0; i < tokens.Length; i++) {
            tokens[i] = tokenSpans[i].getCoveredText(sentence).toString();

            // Each token is tagged in isolation; the start/end indices stored on the
            // Tag are the offsets of the enclosing sentence, not of the token itself.
            var tag = tagger.tag(new[] { tokenSpans[i].getCoveredText(sentence).toString() }).FirstOrDefault();
            tagsResult.Add(new Tag { startIndex = start, endIndex = end, category = tag });
        }
    }

    return tagsResult;
}
public static POSModel TrainModel(string path, ModelType mt) {
    FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read);
    WordTagSampleStream stream = new WordTagSampleStream(fs);

    TrainingParameters trainParams = new TrainingParameters();
    trainParams.Set(Parameters.Iterations, "100");
    trainParams.Set(Parameters.Cutoff, "0");

    switch (mt) {
        case ModelType.Maxent:
            trainParams.Set(Parameters.Algorithm, "MAXENT");
            break;
        case ModelType.Perceptron:
            trainParams.Set(Parameters.Algorithm, "PERCEPTRON");
            break;
        default:
            throw new NotSupportedException();
    }

    return POSTaggerME.Train(TRAINING_LANGUAGE, stream, trainParams, new POSTaggerFactory());
}
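A short usage sketch for TrainModel. The training/output file names are placeholders, and the Serialize(Stream) call is an assumption that this port mirrors OpenNLP's serialize-to-stream model API; neither comes from the snippet above.

// Hypothetical usage: train from a word_TAG formatted file and persist the model.
// "en-pos.train", "en-pos-custom.bin" and Serialize(Stream) are illustrative assumptions.
POSModel model = TrainModel("en-pos.train", ModelType.Maxent);
using (var output = new FileStream("en-pos-custom.bin", FileMode.Create, FileAccess.Write)) {
    model.Serialize(output);
}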
public PartOfSpeechRecognizer() {
    POSModel posModel;
    using (var modelFile = new FileStream("en-pos-maxent.bin", FileMode.Open))
        posModel = new POSModel(modelFile);

    PosTagger = new POSTaggerME(posModel);
}
public DeterminerPartOfSpeech() {
    POSModel posModel;
    using (var modelFile = new FileStream("en-pos-maxent.bin", FileMode.Open))
        posModel = new POSModel(modelFile);

    posTagger = new POSTaggerME(posModel);
}
public void TestBuildNGramDictionary() {
    var samples = CreateSampleStream();
    var d = POSTaggerME.BuildNGramDictionary(samples, 0);

    Assert.NotNull(d);
}
private void LoadTagger() {
    if (!alreadyLoadTokenizer) {
        java.io.FileInputStream modelInpStream = new java.io.FileInputStream("Resources\\en-pos-maxent.bin");
        POSModel posModel = new POSModel(modelInpStream);
        tagger = new POSTaggerME(posModel);

        alreadyLoadTokenizer = true;
    }
}
string[] POSTagger(string[] tokens) {
    InputStream modelIn = new FileInputStream(modelPath + "en-pos-maxent.zip");
    POSModel model = new POSModel(modelIn);
    POSTaggerME tagger = new POSTaggerME(model);

    string[] tags = tagger.tag(tokens);

    //int i = 0;
    //foreach (string s in tags)
    //{
    //    System.Console.WriteLine("{0} : {1}", tokens[i], s);
    //    debug.Print(tokens[i] + " : " + s + "\n");
    //    i++;
    //}

    return tags;
}
public void TestPosTagger() {
    var posModel = TrainPosModel();
    var tagger = new POSTaggerME(posModel);

    var tags = tagger.Tag(new[] { "The", "driver", "got", "badly", "injured", "." });

    Assert.AreEqual(6, tags.Length);

    Assert.AreEqual("DT", tags[0]);
    Assert.AreEqual("NN", tags[1]);
    Assert.AreEqual("VBD", tags[2]);
    Assert.AreEqual("RB", tags[3]);
    Assert.AreEqual("VBN", tags[4]);
    Assert.AreEqual(".", tags[5]);
}
private void button1_Click(object sender, EventArgs e) {
    InputStream modelIn = new FileInputStream("en-pos-maxent.bin");
    POSModel model = new POSModel(modelIn);

    // initialize POSTaggerME
    POSTaggerME tagger = new POSTaggerME(model);

    words = textBox1.Text.Split();
    String[] result = tagger.tag(words);

    label1.Text = "";
    for (int i = 0; i < result.Length; i++) {
        label1.Text += result[i] + ", ";
    }
}
public SentenceProcessor(String[] words) {
    this.words = new String[words.Length];
    tempWords = new String[words.Length];
    POS_Tags = new String[words.Length];

    for (int i = 0; i < words.Length; i++) {
        this.words[i] = words[i];
        tempWords[i] = words[i];
    }

    modelIn = new FileInputStream("en-pos-maxent.bin");
    model = new POSModel(modelIn);
    tagger = new POSTaggerME(model);

    POS_Tags = getPOS_Tags();
}
private void LoadModels() {
    POSModel posModel;
    using (var modelFile = new FileStream(Path.Combine(configuration.Resources, configuration.NlpModels, "en-pos-maxent.bin"), FileMode.Open, FileAccess.Read, FileShare.Read)) {
        posModel = new POSModel(modelFile);
    }

    ChunkerModel chunkerModel;
    using (var modelFile = new FileStream(Path.Combine(configuration.Resources, configuration.NlpModels, "en-chunker.bin"), FileMode.Open, FileAccess.Read, FileShare.Read)) {
        chunkerModel = new ChunkerModel(modelFile);
    }

    posTagger = new POSTaggerME(posModel);
    chunker = new ChunkerME(chunkerModel);
}
private void LoadModels(string resourcesFolder) {
    POSModel posModel;
    using (var modelFile = new FileStream(Path.Combine(resourcesFolder, @"1.5/en-pos-maxent.bin"), FileMode.Open, FileAccess.Read, FileShare.Read)) {
        posModel = new POSModel(modelFile);
    }

    ChunkerModel chunkerModel;
    using (var modelFile = new FileStream(Path.Combine(resourcesFolder, @"1.5/en-chunker.bin"), FileMode.Open, FileAccess.Read, FileShare.Read)) {
        chunkerModel = new ChunkerModel(modelFile);
    }

    posTagger = new POSTaggerME(posModel);
    chunker = new ChunkerME(chunkerModel);
}
public override void run(string[] args) {
    if (args.Length != 1) {
        Console.WriteLine(Help);
    } else {
        POSModel model = (new POSModelLoader()).load(new File(args[0]));

        POSTaggerME tagger = new POSTaggerME(model);

        ObjectStream<string> lineStream = new PlainTextByLineStream(new InputStreamReader(Console.OpenStandardInput));

        PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
        perfMon.start();

        try {
            string line;
            while ((line = lineStream.read()) != null) {
                string[] whitespaceTokenizerLine = WhitespaceTokenizer.INSTANCE.tokenize(line);
                string[] tags = tagger.tag(whitespaceTokenizerLine);

                POSSample sample = new POSSample(whitespaceTokenizerLine, tags);
                Console.WriteLine(sample.ToString());

                perfMon.incrementCounter();
            }
        } catch (IOException e) {
            CmdLineUtil.handleStdinIoError(e);
        }

        perfMon.stopAndPrintFinalResult();
    }
}
public NLP() {
    //loading sentence detector model
    java.io.FileInputStream modelInpStream = new java.io.FileInputStream("Resources\\en-sent.bin");
    SentenceModel sentenceModel = new SentenceModel(modelInpStream);
    sentenceDetector = new SentenceDetectorME(sentenceModel);

    //loading tokenizer model
    modelInpStream = new java.io.FileInputStream("Resources\\en-token.bin");
    TokenizerModel tokenizerModel = new TokenizerModel(modelInpStream);
    tokenizer = new TokenizerME(tokenizerModel);

    modelInpStream = new java.io.FileInputStream("Resources\\en-pos-maxent.bin");
    POSModel posModel = new POSModel(modelInpStream);
    tagger = new POSTaggerME(posModel);

    modelInpStream = new java.io.FileInputStream("Resources\\en-chunker.bin");
    ChunkerModel chunkerModel = new ChunkerModel(modelInpStream);
    chunker = new ChunkerME(chunkerModel);

    modelInpStream = new java.io.FileInputStream("Resources\\en-parser-chunking.bin");
    ParserModel parserModel = new ParserModel(modelInpStream);
    parser = ParserFactory.create(parserModel);

    //loading stop words list
    StreamReader sr = new StreamReader("Resources\\english.stop.txt");
    string line;
    while ((line = sr.ReadLine()) != null) {
        stopwords.Add(Stemming(line));
        stopwords.Add(line);
    }
}
public static IEnumerable<IEnumerable<ChunkItem>> GetChunks(IEnumerable<string> Sentences) {
    var posModelStream = new java.io.ByteArrayInputStream(Resource.en_pos_maxent); //new java.io.FileInputStream(@"C:\dev\d-mill\TextProcessing\OpenNLP\Models\en-pos-maxent.bin");
    var posModel = new POSModel(posModelStream);
    var pos = new POSTaggerME(posModel);

    var modelStream = new java.io.ByteArrayInputStream(Resource.en_token); //java.io.FileInputStream(@"C:\dev\d-mill\TextProcessing\OpenNLP\Models\en-token.bin");
    var model = new TokenizerModel(modelStream);
    var tokenizer = new TokenizerME(model);

    var chunkerModelStream = new java.io.ByteArrayInputStream(Resource.en_chunker);
    var chunkerModel = new ChunkerModel(chunkerModelStream);
    var chunker = new ChunkerME(chunkerModel);

    return Sentences.Select(p => {
        var tokens = tokenizer.tokenize(p);
        var tags = pos.tag(tokens);
        var chunks = chunker.chunk(tokens, tags);

        var res = new List<ChunkItem>();
        for (var i = 0; i < chunks.Length; i++) {
            res.Add(new ChunkItem { token = tokens[i], tag = tags[i], chunk = chunks[i] });
        }
        return res;
    });
}
internal static POSModel TrainPosModel(ModelType type = ModelType.Maxent) {
    var p = new TrainingParameters();
    switch (type) {
        case ModelType.Maxent:
            p.Set(Parameters.Algorithm, "MAXENT");
            break;
        case ModelType.Perceptron:
            p.Set(Parameters.Algorithm, "PERCEPTRON");
            break;
        default:
            throw new NotSupportedException();
    }

    p.Set(Parameters.Iterations, "100");
    p.Set(Parameters.Cutoff, "5");

    return POSTaggerME.Train("en", CreateSampleStream(), p, new POSTaggerFactory());
}
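CreateSampleStream is referenced by TrainPosModel, TestPosTagger and TestBuildNGramDictionary but not shown here; a minimal sketch of one possible implementation, assuming a word_TAG formatted training file and the WordTagSampleStream type used in TrainModel above. The file name is a placeholder.

// Hypothetical helper: builds the POS sample stream consumed by the training/test snippets.
// WordTagSampleStream reads lines of "word_TAG" tokens, as in the TrainModel snippet.
internal static IObjectStream<POSSample> CreateSampleStream() {
    var file = new FileStream("en-pos.train", FileMode.Open, FileAccess.Read);
    return new WordTagSampleStream(file);
}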
public override void run(string format, string[] args) {
    base.run(format, args);

    mlParams = CmdLineUtil.loadTrainingParameters(@params.Params, true);
    if (mlParams != null && !TrainUtil.isValid(mlParams.Settings)) {
        throw new TerminateToolException(1, "Training parameters file '" + @params.Params + "' is invalid!");
    }

    if (mlParams == null) {
        mlParams = ModelUtil.createTrainingParameters(@params.Iterations.Value, @params.Cutoff.Value);
        mlParams.put(TrainingParameters.ALGORITHM_PARAM, getModelType(@params.Type).ToString());
    }

    File modelOutFile = @params.Model;
    CmdLineUtil.checkOutputFile("pos tagger model", modelOutFile);

    Dictionary ngramDict = null;

    int? ngramCutoff = @params.Ngram;
    if (ngramCutoff != null) {
        Console.Error.Write("Building ngram dictionary ... ");
        try {
            ngramDict = POSTaggerME.buildNGramDictionary(sampleStream, ngramCutoff.Value);
            sampleStream.reset();
        } catch (IOException e) {
            throw new TerminateToolException(-1, "IO error while building NGram Dictionary: " + e.Message, e);
        }
        Console.Error.WriteLine("done");
    }

    POSTaggerFactory postaggerFactory = null;
    try {
        postaggerFactory = POSTaggerFactory.create(@params.Factory, ngramDict, null);
    } catch (InvalidFormatException e) {
        throw new TerminateToolException(-1, e.Message, e);
    }

    if (@params.Dict != null) {
        try {
            postaggerFactory.TagDictionary = postaggerFactory.createTagDictionary(@params.Dict);
        } catch (IOException e) {
            throw new TerminateToolException(-1, "IO error while loading POS Dictionary: " + e.Message, e);
        }
    }

    if (@params.TagDictCutoff != null) {
        try {
            TagDictionary dict = postaggerFactory.TagDictionary;
            if (dict == null) {
                dict = postaggerFactory.createEmptyTagDictionary();
                postaggerFactory.TagDictionary = dict;
            }

            if (dict is MutableTagDictionary) {
                POSTaggerME.populatePOSDictionary(sampleStream, (MutableTagDictionary)dict, @params.TagDictCutoff.Value);
            } else {
                throw new System.ArgumentException("Can't extend a POSDictionary that does not implement MutableTagDictionary.");
            }

            sampleStream.reset();
        } catch (IOException e) {
            throw new TerminateToolException(-1, "IO error while creating/extending POS Dictionary: " + e.Message, e);
        }
    }

    POSModel model;
    try {
        model = POSTaggerME.train(@params.Lang, sampleStream, mlParams, postaggerFactory);
    } catch (IOException e) {
        throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + e.Message, e);
    } finally {
        try {
            sampleStream.close();
        } catch (IOException) {
            // sorry that this can fail
        }
    }

    CmdLineUtil.writeModel("pos tagger", modelOutFile, model);
}
public POSTagger(ModelType type = ModelType.Maxent) {
    this.tagger = new POSTaggerME(TrainModel(Environment.CurrentDirectory + TRAINING_MODEL_PATH, type));
}

public POSTagger(POSModel model) {
    this.tagger = new POSTaggerME(model);
}

public POSTagger(FileStream modelStream) {
    POSModel model = new POSModel(modelStream);
    this.tagger = new POSTaggerME(model);
}
/// <summary>
/// Trains a parser model with the given parameters.
/// </summary>
/// <param name="languageCode">The language code.</param>
/// <param name="samples">The data samples.</param>
/// <param name="rules">The head rules.</param>
/// <param name="parameters">The machine learnable parameters.</param>
/// <param name="monitor">
/// An evaluation monitor that can be used to listen to messages during training or to cancel the training operation.
/// This argument can be a <c>null</c> value.
/// </param>
/// <returns>The trained <see cref="ParserModel"/> object.</returns>
/// <exception cref="System.NotSupportedException">Trainer type is not supported.</exception>
public static ParserModel Train(string languageCode, IObjectStream<Parse> samples, AbstractHeadRules rules, TrainingParameters parameters, Monitor monitor) {
    var manifestInfoEntries = new Dictionary<string, string>();

    System.Diagnostics.Debug.Print("Building dictionary");
    var dictionary = BuildDictionary(samples, rules, parameters);

    samples.Reset();

    // tag
    var posModel = POSTaggerME.Train(languageCode, new PosSampleStream(samples), parameters.GetNamespace("tagger"), new POSTaggerFactory(), monitor);

    samples.Reset();

    // chunk
    var chunkModel = ChunkerME.Train(languageCode, new ChunkSampleStream(samples), parameters.GetNamespace("chunker"), new ChunkerFactory(), monitor);

    samples.Reset();

    // build
    System.Diagnostics.Debug.Print("Training builder");
    var bes = new ParserEventStream(samples, rules, ParserEventTypeEnum.Build, dictionary);
    var buildReportMap = new Dictionary<string, string>();
    var buildTrainer = TrainerFactory.GetEventTrainer(parameters.GetNamespace("build"), buildReportMap, monitor);

    var buildModel = buildTrainer.Train(bes);
    Chunking.Parser.MergeReportIntoManifest(manifestInfoEntries, buildReportMap, "build");

    samples.Reset();

    // check
    System.Diagnostics.Debug.Print("Training checker");
    var kes = new ParserEventStream(samples, rules, ParserEventTypeEnum.Check);
    var checkReportMap = new Dictionary<string, string>();
    var checkTrainer = TrainerFactory.GetEventTrainer(parameters.GetNamespace("check"), checkReportMap, monitor);

    var checkModel = checkTrainer.Train(kes);
    Chunking.Parser.MergeReportIntoManifest(manifestInfoEntries, checkReportMap, "check");

    samples.Reset();

    // attach
    System.Diagnostics.Debug.Print("Training attacher");
    var attachEvents = new ParserEventStream(samples, rules, ParserEventTypeEnum.Attach);
    var attachReportMap = new Dictionary<string, string>();
    var attachTrainer = TrainerFactory.GetEventTrainer(parameters.GetNamespace("attach"), attachReportMap, monitor);

    var attachModel = attachTrainer.Train(attachEvents);
    Chunking.Parser.MergeReportIntoManifest(manifestInfoEntries, attachReportMap, "attach");

    return new ParserModel(languageCode, buildModel, checkModel, attachModel, posModel, chunkModel, rules, ParserType.TreeInsert, manifestInfoEntries);
}
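This tree-insert overload pulls component-specific settings through GetNamespace("build"), GetNamespace("check"), and so on. A minimal configuration sketch follows, assuming the OpenNLP convention of dot-prefixed keys (for example "build.Iterations") is what GetNamespace resolves in this port; the key names and values are illustrative, not taken from the snippet.

// Hypothetical helper: per-component training settings for the tree-insert trainer.
// The "namespace.key" prefix convention is an assumption carried over from OpenNLP.
private static TrainingParameters CreateTreeInsertParameters() {
    var parameters = new TrainingParameters();

    parameters.Set("build." + Parameters.Iterations, "200");
    parameters.Set("build." + Parameters.Cutoff, "5");
    parameters.Set("check." + Parameters.Iterations, "100");
    parameters.Set("attach." + Parameters.Iterations, "100");
    parameters.Set("tagger." + Parameters.BeamSize, "10");

    return parameters;
}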
// Constructors and finalizers:
private Repository() {
    _assemblyName = Regex.Match(_assemblyFullName, "^(.*?),.*$").Result("$1");

    _rootDrive = ("/usr/project/xtmp/dp195/Poetix18/").Replace(@"\", Dsc);
    _nlpFolder = ("rhetorica/nlp/").Replace(@"\", Dsc);

    _openNlpModelsFolder = ("OpenNLP/models/").Replace(@"\", Dsc);
    _openNlpModelsPath = RootDrive + _nlpFolder + _openNlpModelsFolder;

    _wordNetFolder = ("WordNet_3/").Replace(@"\", Dsc);
    _wordNetPath = RootDrive + _nlpFolder + _wordNetFolder;

    _grammarFolder = ("StanfordParser/grammar/").Replace(@"\", Dsc);
    _grammarPath = RootDrive + _nlpFolder + _grammarFolder;

    _dataFolder = ("data/").Replace(@"\", Dsc);
    _nlpTextsPath = RootDrive + _dataFolder;

    string[] localTextDirectoryParts = {
        CurrentAssemblyDirectoryPath, "..", "..", "..", "data"
        //"..", "..", "text"
    };
    _localTextPath = Path.Combine(localTextDirectoryParts) + "/"; // For development use

    // WordNet engine:
    Console.Write("Loading WordNet engine.... ");
    _wordNetEngine = new WordNetEngine(WordNetPath, true);
    Console.WriteLine("Done.");

    // OpenNLP sentence detector:
    Console.Write("Loading OpenNLP sentence detector.... ");
    java.io.FileInputStream modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-sent.bin");
    _sentenceModel = new SentenceModel(modelInputStream);
    modelInputStream.close();
    _sentenceDetector = new SentenceDetectorME(_sentenceModel);
    Console.WriteLine("Done.");

    // OpenNLP tokenizer:
    Console.Write("Loading OpenNLP tokenizer.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-token.bin");
    _tokenizerModel = new opennlp.tools.tokenize.TokenizerModel(modelInputStream);
    modelInputStream.close();
    _tokenizer = new opennlp.tools.tokenize.TokenizerME(_tokenizerModel);
    Console.WriteLine("Done.");

    // OpenNLP name finder:
    Console.Write("Loading OpenNLP name finder.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-ner-person.bin");
    _tokenNameFinderModel = new TokenNameFinderModel(modelInputStream);
    modelInputStream.close();
    _nameFinder = new NameFinderME(_tokenNameFinderModel);
    Console.WriteLine("Done.");

    // OpenNLP POS tagger:
    Console.Write("Loading OpenNLP POS tagger.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-pos-maxent.bin");
    _posModel = new POSModel(modelInputStream);
    modelInputStream.close();
    _tagger = new POSTaggerME(_posModel);
    Console.WriteLine("Done.");

    // OpenNLP chunker:
    Console.Write("Loading OpenNLP chunker.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-chunker.bin");
    _chunkerModel = new ChunkerModel(modelInputStream);
    modelInputStream.close();
    _chunker = new ChunkerME(_chunkerModel);
    Console.WriteLine("Done.");

    // OpenNLP parser:
    if (_loadParser) {
        Console.Write("Loading OpenNLP parser.... ");
        modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-parser-chunking.bin");
        _parserModel = new ParserModel(modelInputStream);
        modelInputStream.close();
        _parser = ParserFactory.create(_parserModel);
        Console.WriteLine("Done.");
    }

    // Stanford parser:
    //_stanfordParser = new LexicalizedParser(GrammarPath + "englishPCFG.ser.gz"); // Obsolete method
    _stanfordParser = LexicalizedParser.loadModel(GrammarPath + "englishPCFG.ser.gz");

    // Porter stemmer:
    _porterStemmer = new PorterStemmer();
}