/// <summary>
/// Trains a maxent tokenizer model from the shared token.train test resource,
/// using 100 iterations and a cutoff of zero with alphanumeric optimization enabled.
/// </summary>
/// <returns>The freshly trained <see cref="TokenizerModel"/>.</returns>
public static TokenizerModel CreateMaxentTokenModel() {
    using (var data = Tests.OpenFile("/opennlp/tools/tokenize/token.train")) {
        var sampleStream = new TokenSampleStream(new PlainTextByLineStream(data));

        var parameters = new TrainingParameters();
        parameters.Set(Parameters.Iterations, "100");
        parameters.Set(Parameters.Cutoff, "0");

        var factory = new TokenizerFactory("en", null, true);
        return TokenizerME.Train(sampleStream, factory, parameters);
    }
}
/// <summary>
/// Builds a maxent tokenizer model from the token.train test resource
/// (100 iterations, cutoff 0, alphanumeric optimization on).
/// </summary>
/// <returns>The trained <see cref="TokenizerModel"/>.</returns>
public static TokenizerModel CreateMaxentTokenModel() {
    using (var data = Tests.OpenFile("/opennlp/tools/tokenize/token.train")) {
        var lineStream = new PlainTextByLineStream(data);
        var samples = new TokenSampleStream(lineStream);

        var trainingParameters = new TrainingParameters();
        trainingParameters.Set(Parameters.Iterations, "100");
        trainingParameters.Set(Parameters.Cutoff, "0");

        return TokenizerME.Train(samples, new TokenizerFactory("en", null, true), trainingParameters);
    }
}
/// <summary>
/// Trains a maxent tokenizer model from the training file at <paramref name="path"/>
/// (100 iterations, cutoff 0, alphanumeric optimization enabled).
/// </summary>
/// <param name="path">Path to a token.train-formatted training file.</param>
/// <returns>The trained <see cref="TokenizerModel"/>.</returns>
public static TokenizerModel TrainModel(string path) {
    // using ensures the file handle is released even if training throws;
    // the original opened the FileStream without ever disposing it.
    using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read)) {
        var stream = new TokenSampleStream(new PlainTextByLineStream(fs));

        var trainParams = new TrainingParameters();
        trainParams.Set(Parameters.Iterations, "100");
        trainParams.Set(Parameters.Cutoff, "0");

        return TokenizerME.Train(stream, new TokenizerFactory(TRAINING_LANGUAGE, null, true), trainParams);
    }
}
/// <summary>
/// Trains a tokenizer model with this port, tokenizes with it, then serializes the
/// model to disk, reloads it through the original Java OpenNLP implementation and
/// verifies both tokenizers yield (numerically near-identical) token probabilities.
/// </summary>
public void TestCrossCompatibility() {
    using (var data = Tests.OpenFile("/opennlp/tools/tokenize/token.train")) {
        var samples = new TokenSampleStream(new PlainTextByLineStream(data));

        var mlParams = new TrainingParameters();
        mlParams.Set(Parameters.Iterations, "100");
        mlParams.Set(Parameters.Cutoff, "0");

        var model = TokenizerME.Train(samples, new TokenizerFactory("en", null, true), mlParams);

        var sMe = new TokenizerME(model);
        TokenizerMETest.TestTokenizer(sMe);
        var sProbs = sMe.TokenProbabilities;

        // --- java \/

        var sFile = Path.GetTempFileName();
        try {
            // Dispose the stream so all model bytes are flushed to disk before the
            // Java side reads the file; the original leaked this FileStream and
            // relied on Serialize having flushed everything.
            using (var fileStream = new FileStream(sFile, FileMode.Create)) {
                model.Serialize(fileStream);
            }

            var jModel = new opennlp.tools.tokenize.TokenizerModel(
                OpenNLP.CreateInputStream(sFile)
            );

            var jMe = new opennlp.tools.tokenize.TokenizerME(jModel);

            TestJavaTokenizer(jMe);

            var jProbs = jMe.getTokenProbabilities();

            Assert.AreEqual(jProbs.Length, sProbs.Length);

            for (int i = 0; i < jProbs.Length; i++) {
                // one difference :(
                // -0.00000000000000011102230246251565
                //
                // but still "insignificant" :)
                Assert.AreEqual(jProbs[i], sProbs[i], 0.0000000001d);
            }
        } finally {
            // Remove the temp file so repeated test runs don't accumulate files.
            File.Delete(sFile);
        }
    }
}
/// <summary>
/// Trains a tokenizer model with this port, tokenizes with it, then serializes the
/// model to disk, reloads it through the original Java OpenNLP implementation and
/// verifies both tokenizers yield (numerically near-identical) token probabilities.
/// </summary>
public void TestCrossCompatibility() {
    using (var data = Tests.OpenFile("/opennlp/tools/tokenize/token.train")) {
        var samples = new TokenSampleStream(new PlainTextByLineStream(data));

        var mlParams = new TrainingParameters();
        mlParams.Set(Parameters.Iterations, "100");
        mlParams.Set(Parameters.Cutoff, "0");

        var model = TokenizerME.Train(samples, new TokenizerFactory("en", null, true), mlParams);

        var sMe = new TokenizerME(model);
        TokenizerMETest.TestTokenizer(sMe);
        var sProbs = sMe.TokenProbabilities;

        // --- java \/

        var sFile = Path.GetTempFileName();
        try {
            // Dispose the stream so all model bytes are flushed to disk before the
            // Java side reads the file; the original leaked this FileStream and
            // relied on Serialize having flushed everything.
            using (var fileStream = new FileStream(sFile, FileMode.Create)) {
                model.Serialize(fileStream);
            }

            var jModel = new opennlp.tools.tokenize.TokenizerModel(
                OpenNLP.CreateInputStream(sFile)
            );

            var jMe = new opennlp.tools.tokenize.TokenizerME(jModel);

            TestJavaTokenizer(jMe);

            var jProbs = jMe.getTokenProbabilities();

            Assert.AreEqual(jProbs.Length, sProbs.Length);

            for (int i = 0; i < jProbs.Length; i++) {
                // one difference :(
                // -0.00000000000000011102230246251565
                //
                // but still "insignificant" :)
                Assert.AreEqual(jProbs[i], sProbs[i], 0.0000000001d);
            }
        } finally {
            // Remove the temp file so repeated test runs don't accumulate files.
            File.Delete(sFile);
        }
    }
}