/// <summary>
        /// Initializes a tokenizer from <paramref name="config"/>: creates the sentence splitter and its
        /// callback delegate, pre-sizes the word list, caches the character lookup maps, allocates the
        /// to-upper buffer and selects the POS-tagger / NER input-type processors according to the flags
        /// set in <c>config.TokenizeMode</c>.
        /// </summary>
        /// <param name="config">Tokenizer settings. Dereferenced without a null check.</param>
        public Tokenizer(TokenizerConfig config)
        {
            _SentSplitter          = new SentSplitter(config.SentSplitterConfig);
            _Words                 = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _ParticleThatExclusion = config.Model.ParticleThatExclusion;

            // The delegate is created once here and kept in a field.
            _SentSplitterProcessSentCallback_Delegate = new SentSplitter.ProcessSentCallbackDelegate(SentSplitterProcessSentCallback);

            // Character maps: two come from the shared xlat_Unsafe instance, the third is language-specific.
            var xlat = xlat_Unsafe.Inst;
            _UIM = xlat._UPPER_INVARIANT_MAP;
            _CTM = xlat._CHARTYPE_MAP;

            var languageConsts = UnsafeConst.GetInstanceByLanguage(config.LanguageType);
            _CCTM = languageConsts._CRF_CHARTYPE_MAP;

            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

            // Each processor is real only when its flag is set in the tokenize mode; otherwise the
            // shared dummy instance is used so the field is never left unassigned.
            var tokenizeMode = config.TokenizeMode;

            bool posTaggerRequested = (tokenizeMode & TokenizeMode.PosTagger) == TokenizeMode.PosTagger;
            if (!posTaggerRequested)
            {
                _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
            }
            else
            {
                _PosTaggerInputTypeProcessor = config.PosTaggerInputTypeProcessorFactory.CreateInstance();
            }

            bool nerRequested = (tokenizeMode & TokenizeMode.Ner) == TokenizeMode.Ner;
            if (!nerRequested)
            {
                _NerInputTypeProcessor = Dummy_NerInputTypeProcessor.Instance;
            }
            else
            {
                _NerInputTypeProcessor = config.NerInputTypeProcessorFactory.CreateInstance();
            }
        }
Beispiel #2
0
        /// <summary>
        /// Creates a POS-tagger model builder: validates the arguments, then sets up the scriber for the
        /// given template file, the input-type processor for the given language, a URL detector and an
        /// empty word list.
        /// </summary>
        /// <param name="templateFilename">Template file name; checked with <c>ThrowIfNullOrWhiteSpace</c>.</param>
        /// <param name="languageType">Language passed to <c>CreatePosTaggerInputTypeProcessor</c>.</param>
        /// <param name="urlDetectorConfig">URL detector settings; checked with <c>ThrowIfNull</c>.</param>
        public PosTaggerModelBuilder(string templateFilename, LanguageTypeEnum languageType, UrlDetectorConfig urlDetectorConfig)
        {
            // Argument validation comes first so nothing is constructed from bad input.
            templateFilename.ThrowIfNullOrWhiteSpace("templateFilename");
            urlDetectorConfig.ThrowIfNull("urlDetectorConfig");

            // Scratch list for words; starts empty with the default capacity.
            _Words = new List<word_t>();

            _PosTaggerScriber = PosTaggerScriber.Create4ModelBuilder(templateFilename);
            _PosTaggerInputTypeProcessor = CreatePosTaggerInputTypeProcessor(languageType);
            _UrlDetector = new UrlDetector(urlDetectorConfig);
        }
        /// <summary>
        /// Creates the factory and builds the language-specific POS-tagger input-type processor from the
        /// numbers and abbreviations held by <paramref name="model"/>.
        /// </summary>
        /// <param name="model">Resources model; only read when the language is supported.</param>
        /// <param name="languageType">Language to build the processor for (<c>Ru</c> or <c>En</c>).</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="languageType"/> is neither <c>Ru</c> nor <c>En</c>; the message is the enum value's text.
        /// </exception>
        internal PosTaggerInputTypeProcessorFactory(PosTaggerResourcesModel model, LanguageTypeEnum languageType)
        {
            if (languageType == LanguageTypeEnum.Ru)
            {
                _PosTaggerInputTypeProcessor = new PosTaggerInputTypeProcessor_Ru(model.Numbers, model.Abbreviations);
            }
            else if (languageType == LanguageTypeEnum.En)
            {
                _PosTaggerInputTypeProcessor = new PosTaggerInputTypeProcessor_En(model.Numbers, model.Abbreviations);
            }
            else
            {
                // Unsupported language: report the offending value as the exception message.
                throw new ArgumentException(languageType.ToString());
            }
        }
Beispiel #4
0
        /// <summary>
        /// Initializes a tokenizer for NER model building: sets up the URL detector, the empty build-model
        /// sentence, the pre-sized word lists, the character lookup maps and the to-upper buffer. The
        /// POS-tagger processor is always the dummy instance; the NER processor comes from the config's factory.
        /// </summary>
        /// <param name="config">Model-builder tokenizer settings. Dereferenced without a null check.</param>
        private Tokenizer(TokenizerConfig4NerModelBuilder config)
        {
            // NOTE(review): this writes to the caller's UrlDetectorConfig instance, so the change to
            // UrlExtractMode is visible outside this constructor — confirm that this is intended.
            config.UrlDetectorConfig.UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position;
            _urlDetector = new UrlDetector(config.UrlDetectorConfig);

            _buildModelSentence    = Sentence.CreateEmpty();
            _words                 = new List<Word>(DEFAULT_WORDSLIST_CAPACITY);
            _buildModelWords       = new List<Buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _particleThatExclusion = config.Model.ParticleThatExclusion;

            // Character maps: two come from the shared XlatUnsafe instance, the third is language-specific.
            var xlat = XlatUnsafe.Inst;
            _UIM = xlat._UPPER_INVARIANT_MAP;
            _CTM = xlat._CHARTYPE_MAP;

            var languageConsts = UnsafeConst.GetInstanceByLanguage(config.LanguageType);
            _CCTM = languageConsts._CRF_CHARTYPE_MAP;

            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

            // Only the NER processor is real here; the POS-tagger slot gets the dummy instance.
            _posTaggerInputTypeProcessor = DummyPosTaggerInputTypeProcessor.Instance;
            _nerInputTypeProcessor       = config.NerInputTypeProcessorFactory.CreateInstance();
        }
        /// <summary>
        /// Initializes a tokenizer for NER model building: sets up the URL detector, the empty build-model
        /// sentence, the pre-sized word lists, the character lookup maps and the to-upper buffer. The
        /// POS-tagger processor is always the dummy instance; the NER processor comes from the config's factory.
        /// </summary>
        /// <param name="config">Model-builder tokenizer settings. Dereferenced without a null check.</param>
        private Tokenizer(TokenizerConfig4NerModelBuilder config)
        {
            // A fresh UrlDetectorConfig is built around the caller's URL model with the extract mode set
            // to Position, so the caller's own config object is not modified.
            var positionUrlDetectorConfig = new UrlDetectorConfig
            {
                Model          = config.UrlDetectorConfig.Model,
                UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position
            };
            _UrlDetector = new UrlDetector(positionUrlDetectorConfig);

            _BuildModelSent        = sent_t.CreateEmpty();
            _Words                 = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _BuildModelWords       = new List<buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _ParticleThatExclusion = config.Model.ParticleThatExclusion;

            // Character maps: two come from the shared xlat_Unsafe instance, the third is language-specific.
            var xlat = xlat_Unsafe.Inst;
            _UIM = xlat._UPPER_INVARIANT_MAP;
            _CTM = xlat._CHARTYPE_MAP;

            var languageConsts = UnsafeConst.GetInstanceByLanguage(config.LanguageType);
            _CCTM = languageConsts._CRF_CHARTYPE_MAP;

            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

            // Only the NER processor is real here; the POS-tagger slot gets the dummy instance.
            _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
            _NerInputTypeProcessor       = config.NerInputTypeProcessorFactory.CreateInstance();
        }