/// <summary>
/// Splits <c>Text</c> into tokens using a <c>TokenizerME</c> built from the
/// <c>en-token.bin</c> model file located under <c>BaseFolder</c>.
/// </summary>
/// <returns>A new list holding the tokens produced by the tokenizer.</returns>
public List<string> Tokenize()
{
    // File.OpenRead returns a FileStream that holds an OS file handle; the using
    // statement guarantees it is released even when loading or tokenizing throws.
    // The stream stays open until tokenization has finished, so this does not rely
    // on TokenizerModel having consumed the whole file inside its constructor.
    using (var modelStream = File.OpenRead(BaseFolder + "en-token.bin"))
    {
        var model = new TokenizerModel(modelStream);
        var tokenizer = new TokenizerME(model);

        // ToList() materializes the result before the stream is disposed.
        return tokenizer.Tokenize(this.Text).ToList();
    }
}
/// <summary>
/// Creates a <see cref="TokenizerME"/> configured from the supplied tokenizer model.
/// </summary>
/// <param name="model">
/// The tokenizer model that provides the maxent model, the tokenizer factory and the
/// alphanumeric optimization flag.
/// </param>
public TokenizerME(TokenizerModel model)
{
    // Both collections start out empty; the probability list is pre-sized to 50 entries.
    newTokens = new List<Span>();
    tokProbs = new List<double>(50);

    this.model = model.MaxentModel;

    // The factory supplies the alphanumeric pattern and the context generator.
    var tokenizerFactory = model.Factory;
    var alphaNumericPattern = tokenizerFactory.AlphaNumericPattern;
    alphanumeric = new Regex(alphaNumericPattern, RegexOptions.Compiled);
    cg = tokenizerFactory.ContextGenerator;

    useAlphaNumericOptimization = model.UseAlphaNumericOptimization;
}
/// <summary>
/// Builds a new <see cref="TokenizerME"/> from the given tokenizer model.
/// </summary>
/// <param name="model">
/// The tokenizer model; its maxent model, factory and alphanumeric optimization
/// setting are copied into this instance.
/// </param>
public TokenizerME(TokenizerModel model)
{
    // Fresh, empty lists; tokProbs is created with an initial capacity of 50.
    tokProbs = new List<double>(50);
    newTokens = new List<Span>();

    this.model = model.MaxentModel;

    // Pattern and context generator both come from the model's factory.
    var modelFactory = model.Factory;
    this.alphanumeric = new Regex(modelFactory.AlphaNumericPattern, RegexOptions.Compiled);
    this.cg = modelFactory.ContextGenerator;

    this.useAlphaNumericOptimization = model.UseAlphaNumericOptimization;
}
/// <summary>
/// Trains a model with a <c>TokenizerFactory</c> for the language "es" and checks the
/// factory settings twice: on the freshly trained model and on a model rebuilt from
/// its serialized form.
/// </summary>
public void TestDefault()
{
    var abbreviations = LoadAbbDictionary();
    const string language = "es";

    var trained = Train(new TokenizerFactory(language, abbreviations, false, null));

    // Settings as reported by the freshly trained model.
    var trainedFactory = trained.Factory;
    Assert.IsInstanceOf<Dict>(trainedFactory.AbbreviationDictionary);
    Assert.IsInstanceOf<DefaultTokenContextGenerator>(trainedFactory.ContextGenerator);
    Assert.AreEqual(Factory.DefaultAlphanumeric, trainedFactory.AlphaNumericPattern);
    Assert.AreEqual(language, trainedFactory.LanguageCode);
    Assert.AreEqual(language, trained.Language);
    Assert.AreEqual(false, trainedFactory.UseAlphaNumericOptimization);

    using (var buffer = new MemoryStream())
    {
        // NOTE(review): the wrapper presumably keeps Serialize from closing the
        // buffer so it can be read back below — confirm against UnclosableStream.
        trained.Serialize(new UnclosableStream(buffer));

        // Rewind so the model is read from the start of the serialized data.
        buffer.Position = 0;

        var restored = new TokenizerModel(buffer);

        // The same settings must be reported after the round trip.
        var restoredFactory = restored.Factory;
        Assert.IsInstanceOf<Dict>(restoredFactory.AbbreviationDictionary);
        Assert.IsInstanceOf<DefaultTokenContextGenerator>(restoredFactory.ContextGenerator);
        Assert.AreEqual(Factory.DefaultAlphanumeric, restoredFactory.AlphaNumericPattern);
        Assert.AreEqual(language, restoredFactory.LanguageCode);
        Assert.AreEqual(language, restored.Language);
        Assert.AreEqual(false, restoredFactory.UseAlphaNumericOptimization);
    }
}
/// <summary>
/// Trains a model with a <c>DummyTokenizerFactory</c> using a custom alphanumeric
/// pattern and checks that the custom factory, its dictionary, its context generator
/// and its settings are reported both by the trained model and by a model rebuilt
/// from its serialized form.
/// </summary>
public void TestDummyFactory()
{
    const string language = "es";
    const string alphaNumericPattern = "^[0-9A-Za-z]+$";

    var abbreviations = LoadAbbDictionary();
    var trained = Train(new DummyTokenizerFactory(language, abbreviations, true, alphaNumericPattern));

    // Settings as reported by the freshly trained model.
    Assert.IsInstanceOf<DummyTokenizerFactory>(trained.Factory);
    var trainedFactory = trained.Factory;
    Assert.IsInstanceOf<DummyTokenizerFactory.DummyDictionary>(trainedFactory.AbbreviationDictionary);
    Assert.IsInstanceOf<DummyTokenizerFactory.DummyContextGenerator>(trainedFactory.ContextGenerator);
    Assert.AreEqual(alphaNumericPattern, trainedFactory.AlphaNumericPattern);
    Assert.AreEqual(language, trainedFactory.LanguageCode);
    Assert.AreEqual(language, trained.Language);
    Assert.AreEqual(true, trainedFactory.UseAlphaNumericOptimization);

    using (var buffer = new MemoryStream())
    {
        // NOTE(review): the wrapper presumably keeps Serialize from closing the
        // buffer so it can be read back below — confirm against UnclosableStream.
        trained.Serialize(new UnclosableStream(buffer));

        // Rewind so the model is read from the start of the serialized data.
        buffer.Position = 0;

        var restored = new TokenizerModel(buffer);

        // The same settings must be reported after the round trip.
        Assert.IsInstanceOf<DummyTokenizerFactory>(restored.Factory);
        var restoredFactory = restored.Factory;
        Assert.IsInstanceOf<DummyTokenizerFactory.DummyDictionary>(restoredFactory.AbbreviationDictionary);
        Assert.IsInstanceOf<DummyTokenizerFactory.DummyContextGenerator>(restoredFactory.ContextGenerator);
        Assert.AreEqual(alphaNumericPattern, restoredFactory.AlphaNumericPattern);
        Assert.AreEqual(language, restoredFactory.LanguageCode);
        Assert.AreEqual(language, restored.Language);
        Assert.AreEqual(true, restoredFactory.UseAlphaNumericOptimization);
    }
}