/// <summary>
/// Initializes a tokenizer from <paramref name="config"/>: creates the sentence splitter and the
/// reusable word list, stores references to the shared character lookup tables, allocates the
/// to-upper buffer and selects the POS-tagger / NER input-type processors according to
/// <c>config.TokenizeMode</c>.
/// </summary>
/// <param name="config">
/// Tokenizer settings. This constructor reads <c>SentSplitterConfig</c>, <c>Model.ParticleThatExclusion</c>,
/// <c>LanguageType</c>, <c>TokenizeMode</c> and, only when the matching mode flag is set, the
/// corresponding processor factory.
/// </param>
public Tokenizer(TokenizerConfig config)
{
    // Sentence splitting plus the word buffer that is pre-sized with the default capacity.
    _SentSplitter = new SentSplitter(config.SentSplitterConfig);
    _Words = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
    _ParticleThatExclusion = config.Model.ParticleThatExclusion;

    // The callback delegate is created once here and kept in a field.
    _SentSplitterProcessSentCallback_Delegate =
        new SentSplitter.ProcessSentCallbackDelegate(SentSplitterProcessSentCallback);

    // Lookup tables taken from the shared xlat_Unsafe instance.
    _UIM = xlat_Unsafe.Inst._UPPER_INVARIANT_MAP;
    _CTM = xlat_Unsafe.Inst._CHARTYPE_MAP;

    // CRF char-type table comes from the language-specific UnsafeConst instance
    // (an alternative left commented out in the original: UnsafeConst.Inst._CRF_CHARTYPE_MAP).
    _CCTM = UnsafeConst.GetInstanceByLanguage(config.LanguageType)._CRF_CHARTYPE_MAP;

    ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

    var mode = config.TokenizeMode;

    // POS-tagger processor: dummy instance unless the PosTagger flag is present in the mode.
    if ((mode & TokenizeMode.PosTagger) != TokenizeMode.PosTagger)
    {
        _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
    }
    else
    {
        _PosTaggerInputTypeProcessor = config.PosTaggerInputTypeProcessorFactory.CreateInstance();
    }

    // NER processor: dummy instance unless the Ner flag is present in the mode.
    if ((mode & TokenizeMode.Ner) != TokenizeMode.Ner)
    {
        _NerInputTypeProcessor = Dummy_NerInputTypeProcessor.Instance;
    }
    else
    {
        _NerInputTypeProcessor = config.NerInputTypeProcessorFactory.CreateInstance();
    }
}
/// <summary>
/// Initializes the tokenizer with a sentence splitter built from <paramref name="config"/>
/// and a word list pre-sized to <paramref name="wordCapacity"/>.
/// </summary>
/// <param name="config">Configuration passed straight to the <c>SentSplitter</c> constructor.</param>
/// <param name="wordCapacity">Initial capacity handed to the <c>List&lt;word_t&gt;</c> constructor.</param>
public ner_tokenizer(SentSplitterConfig config, int wordCapacity)
{
    this._SentSplitter = new SentSplitter(config);
    this._Words = new List<word_t>(wordCapacity);
}
private bool _NotSkipNonLetterAndNonDigitToTheEnd; // needed by the NER model builder
#endregion

/// <summary>
/// Initializes the tokenizer with a sentence splitter built from <paramref name="config"/>
/// and a word list pre-sized to <c>DEFAULT_WORDSLIST_CAPACITY</c>.
/// </summary>
/// <param name="config">Configuration passed straight to the <c>SentSplitter</c> constructor.</param>
public ner_tokenizer(SentSplitterConfig config)
{
    this._SentSplitter = new SentSplitter(config);
    this._Words = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
}