/// <summary>
/// Tokenizes <c>this.Text</c> with a <see cref="TokenizerME"/> built from the
/// <c>en-token.bin</c> model file located under <c>BaseFolder</c>.
/// </summary>
/// <returns>A new list holding the tokens produced by the tokenizer.</returns>
public List<string> Tokenize()
        {
            // File.OpenRead hands back a FileStream that owns an OS file handle; dispose it
            // deterministically instead of leaving it to the finalizer. The model is built and
            // used inside the scope so the stream is still open for as long as it may be needed.
            using (var modelStream = File.OpenRead(BaseFolder + "en-token.bin"))
            {
                var model = new TokenizerModel(modelStream);
                var tokenizer = new TokenizerME(model);

                // ToList materializes the result before the stream is closed.
                return tokenizer.Tokenize(this.Text).ToList();
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Initializes a new instance of the <see cref="TokenizerME"/> class.
        /// </summary>
        /// <param name="model">The tokenizer model.</param>
        /// <exception cref="ArgumentNullException">
        /// Thrown when <paramref name="model"/> is <c>null</c>.
        /// </exception>
        public TokenizerME(TokenizerModel model) {
            // Fail fast with a descriptive exception instead of a NullReferenceException
            // on the first dereference below.
            if (model == null) {
                throw new ArgumentNullException("model");
            }

            this.model = model.MaxentModel;

            var factory = model.Factory;

            // The alphanumeric pattern comes from the model's factory and is compiled once here.
            alphanumeric = new Regex(factory.AlphaNumericPattern, RegexOptions.Compiled);
            cg = factory.ContextGenerator;
            useAlphaNumericOptimization = model.UseAlphaNumericOptimization;

            // Working buffers, allocated once per tokenizer instance.
            newTokens = new List<Span>();
            tokProbs = new List<double>(50);
        }
Esempio n. 3
0
        /// <summary>
        /// Initializes a new instance of the <see cref="TokenizerME"/> class.
        /// </summary>
        /// <param name="model">The tokenizer model.</param>
        /// <exception cref="ArgumentNullException">
        /// Thrown when <paramref name="model"/> is <c>null</c>.
        /// </exception>
        public TokenizerME(TokenizerModel model)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException
            // on the first dereference below.
            if (model == null)
            {
                throw new ArgumentNullException("model");
            }

            this.model = model.MaxentModel;

            var factory = model.Factory;

            // The alphanumeric pattern comes from the model's factory and is compiled once here.
            alphanumeric = new Regex(factory.AlphaNumericPattern, RegexOptions.Compiled);
            cg = factory.ContextGenerator;
            useAlphaNumericOptimization = model.UseAlphaNumericOptimization;

            // Working buffers, allocated once per tokenizer instance.
            newTokens = new List<Span>();
            tokProbs = new List<double>(50);
        }
        /// <summary>
        /// Trains a model with a <see cref="TokenizerFactory"/> and checks the factory settings
        /// both on the trained model and on a copy read back from its serialized form.
        /// </summary>
        public void TestDefault() {
            var abbreviations = LoadAbbDictionary();
            const string lang = "es";

            var model = Train(new TokenizerFactory(lang, abbreviations, false, null));

            // Settings as seen on the freshly trained model.
            var trainedFactory = model.Factory;

            Assert.IsInstanceOf<Dict>(trainedFactory.AbbreviationDictionary);
            Assert.IsInstanceOf<DefaultTokenContextGenerator>(trainedFactory.ContextGenerator);

            Assert.AreEqual(Factory.DefaultAlphanumeric, trainedFactory.AlphaNumericPattern);
            Assert.AreEqual(lang, trainedFactory.LanguageCode);
            Assert.AreEqual(lang, model.Language);

            Assert.AreEqual(false, trainedFactory.UseAlphaNumericOptimization);

            using (var buffer = new MemoryStream()) {
                model.Serialize(new UnclosableStream(buffer));

                // Rewind so the model can be read back from the start of the buffer.
                buffer.Position = 0;

                var restored = new TokenizerModel(buffer);
                var restoredFactory = restored.Factory;

                // The same settings must hold after the serialization round trip.
                Assert.IsInstanceOf<Dict>(restoredFactory.AbbreviationDictionary);
                Assert.IsInstanceOf<DefaultTokenContextGenerator>(restoredFactory.ContextGenerator);

                Assert.AreEqual(Factory.DefaultAlphanumeric, restoredFactory.AlphaNumericPattern);
                Assert.AreEqual(lang, restoredFactory.LanguageCode);
                Assert.AreEqual(lang, restored.Language);

                Assert.AreEqual(false, restoredFactory.UseAlphaNumericOptimization);
            }
        }
        /// <summary>
        /// Trains a model with a <see cref="DummyTokenizerFactory"/> and checks that the custom
        /// factory type and its settings are present both on the trained model and on a copy
        /// read back from its serialized form.
        /// </summary>
        public void TestDummyFactory() {
            const string lang = "es";
            const string pattern = "^[0-9A-Za-z]+$";

            var abbreviations = LoadAbbDictionary();
            var model = Train(new DummyTokenizerFactory(lang, abbreviations, true, pattern));

            Assert.IsInstanceOf<DummyTokenizerFactory>(model.Factory);

            // Settings as seen on the freshly trained model.
            var trainedFactory = model.Factory;

            Assert.IsInstanceOf<DummyTokenizerFactory.DummyDictionary>(trainedFactory.AbbreviationDictionary);
            Assert.IsInstanceOf<DummyTokenizerFactory.DummyContextGenerator>(trainedFactory.ContextGenerator);

            Assert.AreEqual(pattern, trainedFactory.AlphaNumericPattern);
            Assert.AreEqual(lang, trainedFactory.LanguageCode);
            Assert.AreEqual(lang, model.Language);
            Assert.AreEqual(true, trainedFactory.UseAlphaNumericOptimization);

            using (var buffer = new MemoryStream()) {
                model.Serialize(new UnclosableStream(buffer));

                // Rewind so the model can be read back from the start of the buffer.
                buffer.Position = 0;

                var restored = new TokenizerModel(buffer);

                Assert.IsInstanceOf<DummyTokenizerFactory>(restored.Factory);

                // The same settings must hold after the serialization round trip.
                var restoredFactory = restored.Factory;

                Assert.IsInstanceOf<DummyTokenizerFactory.DummyDictionary>(restoredFactory.AbbreviationDictionary);
                Assert.IsInstanceOf<DummyTokenizerFactory.DummyContextGenerator>(restoredFactory.ContextGenerator);

                Assert.AreEqual(pattern, restoredFactory.AlphaNumericPattern);
                Assert.AreEqual(lang, restoredFactory.LanguageCode);
                Assert.AreEqual(lang, restored.Language);
                Assert.AreEqual(true, restoredFactory.UseAlphaNumericOptimization);
            }
        }