/// <summary>
/// Builds a tokenizer from <paramref name="config"/>: creates the sentence splitter,
/// the reusable word list, the character lookup tables and the POS-tagger / NER
/// input-type processors selected by <c>config.TokenizeMode</c>.
/// </summary>
/// <param name="config">Tokenizer settings (sentence splitter config, model, language, mode, processor factories).</param>
public Tokenizer(TokenizerConfig config)
{
    // Sentence splitting and the per-sentence word buffer.
    _SentSplitter = new SentSplitter(config.SentSplitterConfig);
    _Words = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
    _ParticleThatExclusion = config.Model.ParticleThatExclusion;

    // The callback delegate is created once here and stored in a field.
    _SentSplitterProcessSentCallback_Delegate =
        new SentSplitter.ProcessSentCallbackDelegate(SentSplitterProcessSentCallback);

    // Shared character lookup tables; the CRF char-type map depends on the language.
    var xlat = xlat_Unsafe.Inst;
    _UIM = xlat._UPPER_INVARIANT_MAP;
    _CTM = xlat._CHARTYPE_MAP;
    var languageConst = UnsafeConst.GetInstanceByLanguage(config.LanguageType);
    _CCTM = languageConst._CRF_CHARTYPE_MAP;

    ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

    // POS-tagger processor: a real one only when the PosTagger flag is set, otherwise the dummy instance.
    var posTaggerRequested = (config.TokenizeMode & TokenizeMode.PosTagger) == TokenizeMode.PosTagger;
    if (!posTaggerRequested)
    {
        _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
    }
    else
    {
        _PosTaggerInputTypeProcessor = config.PosTaggerInputTypeProcessorFactory.CreateInstance();
    }

    // NER processor: same selection rule, driven by the Ner flag.
    var nerRequested = (config.TokenizeMode & TokenizeMode.Ner) == TokenizeMode.Ner;
    if (!nerRequested)
    {
        _NerInputTypeProcessor = Dummy_NerInputTypeProcessor.Instance;
    }
    else
    {
        _NerInputTypeProcessor = config.NerInputTypeProcessorFactory.CreateInstance();
    }
}
/// <summary>
/// Builds the CRF character-type table (one byte per <see cref="char"/> value) for
/// <paramref name="languageType"/>, pins it and stores a raw pointer to it in
/// <c>_CRF_CHARTYPE_MAP</c>.
/// </summary>
/// <param name="languageType">
/// Selects the "between letter or digit" character set: the English-specific set for
/// <c>LanguageTypeEnum.En</c>, the general set otherwise.
/// </param>
private UnsafeConst(LanguageTypeEnum languageType)
{
    byte asWhitespace = (byte)CRFCharType.InterpreteAsWhitespace;
    byte separately = (byte)CRFCharType.TokenizeDifferentSeparately;
    byte betweenLetterOrDigit = (byte)CRFCharType.BetweenLetterOrDigit;
    byte betweenDigit = (byte)CRFCharType.BetweenDigit;

    var table = new byte[char.MaxValue + 1];

    // Step 1: every punctuation character is interpreted as whitespace.
    // An int counter is used so the loop can include char.MaxValue without wrapping.
    for (int code = char.MinValue; code <= char.MaxValue; code++)
    {
        if (char.IsPunctuation((char)code))
        {
            table[code] = asWhitespace;
        }
    }

    // Step 2: explicit overrides (plain assignment replaces whatever step 1 wrote).
    foreach (var ch in INCLUDE_INTERPRETE_AS_WHITESPACE)
    {
        table[ch] = asWhitespace;
    }
    foreach (var ch in TOKENIZE_DIFFERENT_SEPARATELY)
    {
        table[ch] = separately;
    }

    // Step 3: additional flags are OR-ed onto the existing entry.
    var letterOrDigitSeparators = BETWEEN_LETTER_OR_DIGIT;
    if (languageType == LanguageTypeEnum.En)
    {
        letterOrDigitSeparators = BETWEEN_LETTER_OR_DIGIT_EN;
    }
    foreach (var ch in letterOrDigitSeparators)
    {
        table[ch] |= betweenLetterOrDigit;
    }
    foreach (var ch in BETWEEN_DIGIT)
    {
        table[ch] |= betweenDigit;
    }

    // Step 4: the dot entry is assigned (not OR-ed), discarding any flags set above.
    table[DOT] = (byte)CRFCharType.DotChar;

    // Pin the table so the raw pointer stored in the field stays valid;
    // the handle is not released in this constructor.
    var pinnedTable = GCHandle.Alloc(table, GCHandleType.Pinned);
    _CRF_CHARTYPE_MAP = (CRFCharType*)pinnedTable.AddrOfPinnedObject().ToPointer();
}
/// <summary>
/// Builds a tokenizer for NER model building: URL detection runs in position mode,
/// POS tagging is replaced by the dummy processor and the NER processor comes from
/// the factory in <paramref name="config"/>.
/// </summary>
/// <param name="config">
/// Model-builder settings. NOTE: <c>config.UrlDetectorConfig.UrlExtractMode</c> is
/// overwritten with <c>Position</c> on the object passed in by the caller.
/// </param>
private Tokenizer(TokenizerConfig4NerModelBuilder config)
{
    // Force position-based URL extraction (this writes into the caller's config object).
    config.UrlDetectorConfig.UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position;
    _urlDetector = new UrlDetector(config.UrlDetectorConfig);

    // Reusable buffers for the sentence and words being processed.
    _buildModelSentence = Sentence.CreateEmpty();
    _words = new List<Word>(DEFAULT_WORDSLIST_CAPACITY);
    _buildModelWords = new List<Buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);

    _particleThatExclusion = config.Model.ParticleThatExclusion;

    // Shared character lookup tables; the CRF char-type map depends on the language.
    var xlat = XlatUnsafe.Inst;
    _UIM = xlat._UPPER_INVARIANT_MAP;
    _CTM = xlat._CHARTYPE_MAP;
    var languageConst = UnsafeConst.GetInstanceByLanguage(config.LanguageType);
    _CCTM = languageConst._CRF_CHARTYPE_MAP;

    ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

    // The dummy POS-tagger processor is always used here; only the NER processor is real.
    _posTaggerInputTypeProcessor = DummyPosTaggerInputTypeProcessor.Instance;
    _nerInputTypeProcessor = config.NerInputTypeProcessorFactory.CreateInstance();
}
/// <summary>
/// Builds a tokenizer for NER model building. URL detection uses a fresh
/// <c>UrlDetectorConfig</c> that takes its model from <paramref name="config"/> and
/// runs in position mode; POS tagging uses the dummy processor and the NER processor
/// comes from the factory in <paramref name="config"/>.
/// </summary>
/// <param name="config">Model-builder settings (URL detector config, model, language, NER processor factory).</param>
private Tokenizer(TokenizerConfig4NerModelBuilder config)
{
    // A new config object is filled in here, so the caller's UrlDetectorConfig is only read.
    var urlDetectorConfig = new UrlDetectorConfig();
    urlDetectorConfig.Model = config.UrlDetectorConfig.Model;
    urlDetectorConfig.UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position;
    _UrlDetector = new UrlDetector(urlDetectorConfig);

    // Reusable buffers for the sentence and words being processed.
    _BuildModelSent = sent_t.CreateEmpty();
    _Words = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
    _BuildModelWords = new List<buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);

    _ParticleThatExclusion = config.Model.ParticleThatExclusion;

    // Shared character lookup tables; the CRF char-type map depends on the language.
    var xlat = xlat_Unsafe.Inst;
    _UIM = xlat._UPPER_INVARIANT_MAP;
    _CTM = xlat._CHARTYPE_MAP;
    var languageConst = UnsafeConst.GetInstanceByLanguage(config.LanguageType);
    _CCTM = languageConst._CRF_CHARTYPE_MAP;

    ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

    // The dummy POS-tagger processor is always used here; only the NER processor is real.
    _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
    _NerInputTypeProcessor = config.NerInputTypeProcessorFactory.CreateInstance();
}
/// <summary>
/// Fills the CRF character-type table (one byte per <see cref="char"/> value) for
/// <paramref name="languageType"/>, then pins it and stores a raw pointer to it in
/// <c>_CRF_CHARTYPE_MAP</c>.
/// </summary>
/// <param name="languageType">
/// Chooses between the English-specific and the general "between letter or digit"
/// character set.
/// </param>
private UnsafeConst(LanguageTypeEnum languageType)
{
    var charTypes = new byte[char.MaxValue + 1];

    fixed (byte* cctm = charTypes)
    {
        // Mark every punctuation character as "interpret as whitespace".
        // The exit test sits before the increment so the counter never steps past char.MaxValue.
        var ch = char.MinValue;
        while (true)
        {
            if (char.IsPunctuation(ch))
            {
                cctm[ch] = (byte)CRFCharType.InterpreteAsWhitespace;
            }
            if (ch == char.MaxValue)
            {
                break;
            }
            ch++;
        }

        // Explicit overrides: plain assignment replaces the entry written above.
        foreach (var extra in INCLUDE_INTERPRETE_AS_WHITESPACE)
        {
            cctm[extra] = (byte)CRFCharType.InterpreteAsWhitespace;
        }
        foreach (var separate in TOKENIZE_DIFFERENT_SEPARATELY)
        {
            cctm[separate] = (byte)CRFCharType.TokenizeDifferentSeparately;
        }

        // Additional flags are OR-ed onto whatever the entry already holds.
        var isEnglish = languageType == LanguageTypeEnum.En;
        foreach (var separator in isEnglish ? BETWEEN_LETTER_OR_DIGIT_EN : BETWEEN_LETTER_OR_DIGIT)
        {
            cctm[separator] |= (byte)CRFCharType.BetweenLetterOrDigit;
        }
        foreach (var separator in BETWEEN_DIGIT)
        {
            cctm[separator] |= (byte)CRFCharType.BetweenDigit;
        }

        // The dot entry must be ASSIGNED, not OR-ed: the original author marked the
        // "|=" form as an error, so any flags set on DOT above are intentionally discarded.
        cctm[DOT] = (byte)CRFCharType.DotChar;
    }

    // Pin the array so the pointer stored in the field stays valid;
    // the handle is not released in this constructor.
    var pinnedCharTypes = GCHandle.Alloc(charTypes, GCHandleType.Pinned);
    _CRF_CHARTYPE_MAP = (CRFCharType*)pinnedCharTypes.AddrOfPinnedObject().ToPointer();
}