/// <summary>
/// Creates the tokenizer: allocates the reusable sentence/word buffers and
/// constructs the URL detector from <paramref name="config"/>.
/// </summary>
/// <param name="config">
/// URL detector configuration. NOTE: this constructor overwrites its
/// <c>UrlExtractMode</c> with <c>Position</c>, so the caller's instance is mutated.
/// </param>
public ner_tokenizer(UrlDetectorConfig config)
{
    // Reusable buffers; both word lists start with the same capacity.
    var wordsCapacity = DEFAULT_WORDSLIST_CAPACITY;
    _BuildModelSent  = sent_t.CreateEmpty();
    _Words           = new List<word_t>(wordsCapacity);
    _BuildModelWords = new List<buildmodel_word_t>(wordsCapacity);

    // The extract mode is forced to 'Position' before the detector is built
    // (presumably the detector reads it at construction time - confirm).
    config.UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position;
    _UrlDetector = new UrlDetector(config);
}
/// <summary>
/// Creates a tokenizer for NER model building: constructs the URL detector,
/// allocates the reusable sentence/word buffers, caches the character lookup
/// maps and creates the input-type processors.
/// </summary>
/// <param name="config">
/// Model-builder configuration. Its <c>UrlDetectorConfig</c>, <c>Model</c>,
/// <c>LanguageType</c> and <c>NerInputTypeProcessorFactory</c> members are read.
/// NOTE: <c>UrlDetectorConfig.UrlExtractMode</c> is overwritten with <c>Position</c>,
/// so the caller's configuration object is mutated.
/// </param>
private Tokenizer(TokenizerConfig4NerModelBuilder config)
{
    // The extract mode is forced to 'Position' before the detector is built
    // (presumably the detector reads it at construction time - confirm).
    config.UrlDetectorConfig.UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position;
    _UrlDetector = new UrlDetector(config.UrlDetectorConfig);

    // Reusable buffers.
    _BuildModelSent        = sent_t.CreateEmpty();
    _Words                 = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
    _BuildModelWords       = new List<buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);
    _ParticleThatExclusion = config.Model.ParticleThatExclusion;

    // Character lookup maps, indexed by char value in the tokenizing loops.
    _UIM  = xlat_Unsafe.Inst._UPPER_INVARIANT_MAP;
    _CTM  = xlat_Unsafe.Inst._CHARTYPE_MAP;
    // Language-specific map (the original kept 'UnsafeConst.Inst._CRF_CHARTYPE_MAP'
    // as a commented-out alternative).
    _CCTM = UnsafeConst.GetInstanceByLanguage(config.LanguageType)._CRF_CHARTYPE_MAP;

    // These statements must stay on their own lines: when placed after a '//'
    // comment on the same line they (and the closing brace) become comment text.
    ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

    _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
    _NerInputTypeProcessor       = config.NerInputTypeProcessorFactory.CreateInstance();
}
/// <summary>
/// Tokenizes one sentence. Walks the characters from <c>_BASE + sent.startIndex</c>
/// to <c>_BASE + sent.startIndex + sent.length - 1</c>, tracking the pending token in
/// <c>_StartIndex</c>/<c>_Length</c>, and finally passes <c>_Words</c> to
/// <c>_OuterProcessSentCallback_Delegate</c>.
/// </summary>
/// <remarks>
/// The scan runs in two passes: the main pass stops at the pointer returned by
/// <c>SkipNonLetterAndNonDigitToTheEnd()</c>; the tail pass then continues from the
/// current position up to the real end of the sentence with a reduced rule set.
/// URL spans listed in <c>sent.urls</c> are skipped here: no word is added for them.
/// </remarks>
/// <param name="sent">Sentence descriptor; <c>startIndex</c>, <c>length</c> and <c>urls</c> are read.</param>
private void SentSplitterProcessSentCallback(sent_t sent)
{
    _Words.Clear();
    _StartIndex = sent.startIndex;
    _Length     = 0;
    _StartPtr   = _BASE + _StartIndex;
    _EndPtr     = _StartPtr + sent.length - 1;

    var urls     = sent.urls;
    var urlIndex = 0;
    // MAX_PTR acts as a "no more urls" sentinel: the 'startUrlPtr <= _Ptr' test below
    // can then never succeed. An empty (non-null) list is treated like null so that
    // urls[0] is never indexed out of range.
    var startUrlPtr = ((urls != null) && (0 < urls.Count))
                      ? (_BASE + urls[0].startIndex)
                      : UnsafeConst.MAX_PTR;

    // ---------------------------------------------------------------- main pass
    var sentEndPtr = _EndPtr;
    _EndPtr = SkipNonLetterAndNonDigitToTheEnd();

    for (_Ptr = _StartPtr; _Ptr <= _EndPtr; _Ptr++)
    {
        // --- url span reached: flush the pending word and jump over the url
        if (startUrlPtr <= _Ptr)
        {
            TryCreateWordAndPut2List();

            var lenu = urls[urlIndex].length;

            // Position on the last char of the url; the loop's _Ptr++ moves past it.
            _Ptr = startUrlPtr + lenu - 1;
            urlIndex++;
            startUrlPtr = (urlIndex < urls.Count)
                          ? (_BASE + urls[urlIndex].startIndex)
                          : UnsafeConst.MAX_PTR;
            _StartIndex = (int)(_Ptr - _BASE + 1);
            _Length     = 0;
            continue;
        }

        var ch = *_Ptr;
        var ct = *(_CTM + ch);

        // --- whitespace: flush the pending word, next token starts after this char
        if ((ct & CharType.IsWhiteSpace) == CharType.IsWhiteSpace)
        {
            TryCreateWordAndPut2List();
            _StartIndex++;
            continue;
        }

        var pct = *(_CCTM + ch);

        // --- dot for which IsUpperNextChar() holds: the dot is appended to the pending token
        if ((pct & CRFCharType.DotChar) == CRFCharType.DotChar &&
            IsUpperNextChar())
        {
            _Length++;
            TryCreateWordAndPut2List();
            continue;
        }

        // --- char that may sit between letters or digits
        if ((pct & CRFCharType.BetweenLetterOrDigit) == CRFCharType.BetweenLetterOrDigit)
        {
            if (IsBetweenLetterOrDigit())
            {
                _Length++;
            }
            else
            {
                TryCreateWordAndPut2List();

                // merge punctuation (with white-spaces); a false result ends the main pass
                if (!MergePunctuation(ch))
                {
                    break;
                }

                // punctuation word
                TryCreateWordAndPut2List();
            }
            continue;
        }
        // relies on the 'BetweenLetterOrDigit' and 'BetweenDigit' lists not intersecting
        else if ((pct & CRFCharType.BetweenDigit) == CRFCharType.BetweenDigit)
        {
            if (IsBetweenDigit())
            {
                _Length++;
            }
            else
            {
                TryCreateWordAndPut2List();

                // merge punctuation (with white-spaces); a false result ends the main pass
                if (!MergePunctuation(ch))
                {
                    break;
                }

                // punctuation word
                TryCreateWordAndPut2List();
            }
            continue;
        }

        // --- char that is tokenized separately
        if ((pct & CRFCharType.TokenizeDifferentSeparately) == CRFCharType.TokenizeDifferentSeparately)
        {
            TryCreateWordAndPut2List();

            // merge punctuation (with white-spaces); a false result ends the main pass
            if (!MergePunctuation(ch))
            {
                break;
            }

            // punctuation word
            TryCreateWordAndPut2List();
            continue;
        }

        // --- char interpreted as whitespace
        if ((pct & CRFCharType.InterpreteAsWhitespace) == CRFCharType.InterpreteAsWhitespace)
        {
            TryCreateWordAndPut2List();
            _StartIndex++;
            continue;
        }

        // --- ordinary char: extend the pending token
        _Length++;
    }

    // last word of the main pass
    TryCreateWordAndPut2List();

    // ---------------------------------------------------------------- tail pass
    // Continues from the current _Ptr up to the real end of the sentence.
    for (_EndPtr = sentEndPtr; _Ptr <= _EndPtr; _Ptr++)
    {
        var ch = *_Ptr;
        var ct = *(_CTM + ch);

        // --- whitespace
        if ((ct & CharType.IsWhiteSpace) == CharType.IsWhiteSpace)
        {
            TryCreateWordAndPut2List();
            _StartIndex++;
            continue;
        }

        var nct = *(_CCTM + ch);

        // --- char that is tokenized separately
        if ((nct & CRFCharType.TokenizeDifferentSeparately) == CRFCharType.TokenizeDifferentSeparately)
        {
            TryCreateWordAndPut2List();

            // merge punctuation (with white-spaces); a false result ends the tail pass
            if (!MergePunctuation(ch))
            {
                break;
            }

            // punctuation word
            TryCreateWordAndPut2List();
            continue;
        }

        // --- char interpreted as whitespace
        if ((nct & CRFCharType.InterpreteAsWhitespace) == CRFCharType.InterpreteAsWhitespace)
        {
            TryCreateWordAndPut2List();
            _StartIndex++;
            continue;
        }

        // --- ordinary char: extend the pending token
        _Length++;
    }

    // last punctuation
    TryCreateWordAndPut2List();

    _OuterProcessSentCallback_Delegate(_Words);
}
/// <summary>
/// Tokenizes one sentence. Walks the characters from <c>_BASE + sent.startIndex</c>
/// to <c>_BASE + sent.startIndex + sent.length - 1</c>, tracking the pending token in
/// <c>_StartIndex</c>/<c>_Length</c>, and finally passes <c>_Words</c> to
/// <c>_ProcessSentCallback</c>.
/// </summary>
/// <remarks>
/// The scan runs in two passes: the main pass stops at the pointer returned by
/// <c>SkipNonLetterAndNonDigitToTheEnd()</c>; the tail pass then continues from the
/// current position up to the real end of the sentence with a reduced rule set.
/// Each URL span listed in <c>sent.urls</c> is added to <c>_Words</c> as a single word
/// with <c>nerInputType = NerInputType.O</c>.
/// </remarks>
/// <param name="sent">Sentence descriptor; <c>startIndex</c>, <c>length</c> and <c>urls</c> are read.</param>
private void ProcessSentSplitterCallback(sent_t sent)
{
    _Words.Clear();
    _StartIndex = sent.startIndex;
    _Length     = 0;
    _StartPtr   = _BASE + _StartIndex;
    _EndPtr     = _StartPtr + sent.length - 1;

    var urls     = sent.urls;
    var urlIndex = 0;
    // MAX_PTR acts as a "no more urls" sentinel: the 'startUrlPtr <= _Ptr' test below
    // can then never succeed. An empty (non-null) list is treated like null so that
    // urls[0] is never indexed out of range.
    var startUrlPtr = ((urls != null) && (0 < urls.Count))
                      ? (_BASE + urls[0].startIndex)
                      : MAX_PTR;

    // ---------------------------------------------------------------- main pass
    var sentEndPtr = _EndPtr;
    _EndPtr = SkipNonLetterAndNonDigitToTheEnd();

    for (_Ptr = _StartPtr; _Ptr <= _EndPtr; _Ptr++)
    {
        // --- url span reached: flush the pending word, emit the url as one word
        if (startUrlPtr <= _Ptr)
        {
            TryCreateWordAndPut2List();

            // create the url word
            var lenu = urls[urlIndex].length;
            var vu   = new string(startUrlPtr, 0, lenu);
            var wu   = new word_t()
            {
                startIndex    = urls[urlIndex].startIndex,
                length        = lenu,
                valueOriginal = vu,
                nerInputType  = NerInputType.O, // URL
            };
            _Words.Add(wu);

            // Position on the last char of the url; the loop's _Ptr++ moves past it.
            _Ptr = startUrlPtr + lenu - 1;
            urlIndex++;
            startUrlPtr = (urlIndex < urls.Count)
                          ? (_BASE + urls[urlIndex].startIndex)
                          : MAX_PTR;
            _StartIndex = (int)(_Ptr - _BASE + 1);
            _Length     = 0;
            continue;
        }

        var ch = *_Ptr;
        var ct = *(_CTM + ch);

        // --- whitespace: flush the pending word, next token starts after this char
        if ((ct & CharType.IsWhiteSpace) == CharType.IsWhiteSpace)
        {
            TryCreateWordAndPut2List();
            _StartIndex++;
            continue;
        }

        var nct = *(_NCTM + ch);

        // --- dot for which IsUpperNextChar() holds: the dot is appended to the pending token
        if ((nct & NERCharType.DotChar) == NERCharType.DotChar &&
            IsUpperNextChar())
        {
            _Length++;
            TryCreateWordAndPut2List();
            continue;
        }

        // --- char that may sit between letters or digits
        if ((nct & NERCharType.BetweenLetterOrDigit) == NERCharType.BetweenLetterOrDigit)
        {
            if (IsBetweenLetterOrDigit())
            {
                _Length++;
            }
            else
            {
                TryCreateWordAndPut2List();

                // merge punctuation (with white-spaces); a false result ends the main pass
                if (!MergePunctuation(ch))
                {
                    break;
                }

                // punctuation word
                TryCreateWordAndPut2List();
            }
            continue;
        }

        // --- char that is tokenized separately
        if ((nct & NERCharType.TokenizeDifferentSeparately) == NERCharType.TokenizeDifferentSeparately)
        {
            TryCreateWordAndPut2List();

            // merge punctuation (with white-spaces); a false result ends the main pass
            if (!MergePunctuation(ch))
            {
                break;
            }

            // punctuation word
            TryCreateWordAndPut2List();
            continue;
        }

        // --- char interpreted as whitespace
        if ((nct & NERCharType.InterpreteAsWhitespace) == NERCharType.InterpreteAsWhitespace)
        {
            TryCreateWordAndPut2List();
            _StartIndex++;
            continue;
        }

        // --- ordinary char: extend the pending token
        _Length++;
    }

    // last word of the main pass
    TryCreateWordAndPut2List();

    // ---------------------------------------------------------------- tail pass
    // Continues from the current _Ptr up to the real end of the sentence.
    for (_EndPtr = sentEndPtr; _Ptr <= _EndPtr; _Ptr++)
    {
        var ch = *_Ptr;
        var ct = *(_CTM + ch);

        // --- whitespace
        if ((ct & CharType.IsWhiteSpace) == CharType.IsWhiteSpace)
        {
            TryCreateWordAndPut2List();
            _StartIndex++;
            continue;
        }

        var nct = *(_NCTM + ch);

        // --- char that is tokenized separately
        if ((nct & NERCharType.TokenizeDifferentSeparately) == NERCharType.TokenizeDifferentSeparately)
        {
            TryCreateWordAndPut2List();

            // merge punctuation (with white-spaces); a false result ends the tail pass
            if (!MergePunctuation(ch))
            {
                break;
            }

            // punctuation word
            TryCreateWordAndPut2List();
            continue;
        }

        // --- char interpreted as whitespace
        if ((nct & NERCharType.InterpreteAsWhitespace) == NERCharType.InterpreteAsWhitespace)
        {
            TryCreateWordAndPut2List();
            _StartIndex++;
            continue;
        }

        // --- ordinary char: extend the pending token
        _Length++;
    }

    // last punctuation
    TryCreateWordAndPut2List();

    _ProcessSentCallback(_Words);
}