Example #1
0
        /// <summary>
        /// Initializes the tokenizer from <paramref name="config"/>: creates the sentence splitter and the
        /// word list, stores references to the shared character maps, allocates the to-upper buffer and
        /// selects the POS-tagger / NER input-type processors according to <c>config.TokenizeMode</c>.
        /// </summary>
        /// <param name="config">
        /// Tokenizer settings. Must not be null. Its <c>Model</c> member is dereferenced unconditionally;
        /// the processor factories are dereferenced only when the matching <c>TokenizeMode</c> flag is set.
        /// </param>
        /// <exception cref="System.ArgumentNullException"><paramref name="config"/> is null.</exception>
        public Tokenizer(TokenizerConfig config)
        {
            // Fail fast with the parameter name instead of a bare NullReferenceException
            // on the first dereference below.
            if (config == null)
            {
                throw new System.ArgumentNullException("config");
            }

            _SentSplitter          = new SentSplitter(config.SentSplitterConfig);
            _Words                 = new List <word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _ParticleThatExclusion = config.Model.ParticleThatExclusion;
            // The delegate is created once here and kept in a field
            // (presumably so it is not re-allocated on every call — TODO confirm against usages).
            _SentSplitterProcessSentCallback_Delegate = new SentSplitter.ProcessSentCallbackDelegate(SentSplitterProcessSentCallback);

            // Character maps: two come from the xlat_Unsafe singleton, the CRF char-type map
            // is taken from the instance that matches the configured language.
            _UIM  = xlat_Unsafe.Inst._UPPER_INVARIANT_MAP;
            _CTM  = xlat_Unsafe.Inst._CHARTYPE_MAP;
            _CCTM = UnsafeConst.GetInstanceByLanguage(config.LanguageType)._CRF_CHARTYPE_MAP;

            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

            // A real POS-tagger processor is created only when the PosTagger flag is set;
            // otherwise the shared dummy instance is used.
            if ((config.TokenizeMode & TokenizeMode.PosTagger) == TokenizeMode.PosTagger)
            {
                _PosTaggerInputTypeProcessor = config.PosTaggerInputTypeProcessorFactory.CreateInstance();
            }
            else
            {
                _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
            }

            // Same selection rule for the NER processor.
            if ((config.TokenizeMode & TokenizeMode.Ner) == TokenizeMode.Ner)
            {
                _NerInputTypeProcessor = config.NerInputTypeProcessorFactory.CreateInstance();
            }
            else
            {
                _NerInputTypeProcessor = Dummy_NerInputTypeProcessor.Instance;
            }
        }
 /// <summary>
 /// Creates a tokenizer whose sentence splitter is built from <paramref name="config"/> and whose
 /// word list is created with an initial capacity of <paramref name="wordCapacity"/>.
 /// </summary>
 /// <param name="config">Configuration handed to the <c>SentSplitter</c> constructor.</param>
 /// <param name="wordCapacity">Initial capacity passed to the word list constructor.</param>
 public ner_tokenizer(SentSplitterConfig config, int wordCapacity)
 {
     // Build both collaborators first (splitter, then list — same order as before), then publish them.
     SentSplitter   sentSplitter = new SentSplitter(config);
     List <word_t>  wordList     = new List <word_t>(wordCapacity);

     this._SentSplitter = sentSplitter;
     this._Words        = wordList;
 }
        // Needed by the NER model builder (per the original author's note).
        // NOTE(review): judging by the name, when true the tokenizer presumably does not skip a trailing run
        // of non-letter/non-digit characters — no usage is visible here, confirm against the readers of this flag.
        private bool _NotSkipNonLetterAndNonDigitToTheEnd;
        #endregion

        /// <summary>
        /// Creates a tokenizer with a sentence splitter built from <paramref name="config"/> and a word
        /// list created with the default capacity <c>DEFAULT_WORDSLIST_CAPACITY</c>.
        /// </summary>
        /// <param name="config">Configuration handed to the <c>SentSplitter</c> constructor.</param>
        public ner_tokenizer(SentSplitterConfig config)
        {
            // Splitter first, then the word list.
            this._SentSplitter = new SentSplitter(config);

            this._Words = new List <word_t>(DEFAULT_WORDSLIST_CAPACITY);
        }