/// <summary>
/// Creates a tokenizer whose URL detector is built from <paramref name="config"/>.
/// </summary>
/// <param name="config">
/// URL detector configuration. NOTE: this object is modified in place - its
/// <c>UrlExtractMode</c> is overwritten with <c>Position</c> before the detector is created.
/// </param>
/// <exception cref="System.ArgumentNullException"><paramref name="config"/> is null.</exception>
public ner_tokenizer(UrlDetectorConfig config)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException
            // on the property assignment below.
            if (config == null)
            {
                throw new System.ArgumentNullException("config");
            }

            // The detector is always created in Position extract mode, regardless of what the caller set.
            config.UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position;

            _UrlDetector     = new UrlDetector(config);
            _BuildModelSent  = sent_t.CreateEmpty();
            // Both word lists are pre-sized with the same default capacity.
            _Words           = new List <word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _BuildModelWords = new List <buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);
        }
Esempio n. 2
0
        /// <summary>
        /// Builds a tokenizer for NER model building from the supplied configuration.
        /// </summary>
        /// <param name="config">
        /// Source of the URL detector settings, the model's particle-exclusion data, the language type
        /// and the NER input-type processor factory. Its <c>UrlDetectorConfig.UrlExtractMode</c> is
        /// overwritten with <c>Position</c>.
        /// </param>
        private Tokenizer(TokenizerConfig4NerModelBuilder config)
        {
            // URL detection: force Position extract mode, then create the detector from that same config.
            config.UrlDetectorConfig.UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position;
            _UrlDetector = new UrlDetector(config.UrlDetectorConfig);

            // Reusable per-sentence containers, pre-sized with the default capacity.
            _Words           = new List <word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _BuildModelWords = new List <buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _BuildModelSent  = sent_t.CreateEmpty();

            _ParticleThatExclusion = config.Model.ParticleThatExclusion;

            // Character lookup tables: two shared ones, plus the CRF char-type map
            // selected by the configured language.
            var xlat = xlat_Unsafe.Inst;
            _UIM = xlat._UPPER_INVARIANT_MAP;
            _CTM = xlat._CHARTYPE_MAP;

            var languageConst = UnsafeConst.GetInstanceByLanguage(config.LanguageType);
            _CCTM = languageConst._CRF_CHARTYPE_MAP;

            // Allocate the upper-casing scratch buffer at its default size.
            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

            // Input-type processors: NER comes from the configured factory, POS-tagger is the dummy instance.
            _NerInputTypeProcessor       = config.NerInputTypeProcessorFactory.CreateInstance();
            _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
        }
Esempio n. 3
0
        /// <summary>
        /// Tokenizes one sentence into <c>_Words</c> and passes that list to
        /// <c>_OuterProcessSentCallback_Delegate</c>.
        /// </summary>
        /// <remarks>
        /// The sentence characters are read through raw pointers relative to <c>_BASE</c>, in two passes:
        /// <list type="number">
        /// <item>the main pass runs from the sentence start up to the pointer returned by
        /// <c>SkipNonLetterAndNonDigitToTheEnd()</c> and applies the full set of character rules;</item>
        /// <item>the tail pass continues from wherever the main pass stopped up to the real end of the
        /// sentence and applies only the whitespace / separate-token / as-whitespace rules.</item>
        /// </list>
        /// URL ranges listed in <c>sent.urls</c> are skipped: no word is created for them here.
        /// <c>TryCreateWordAndPut2List()</c> is defined elsewhere; presumably it emits the pending word
        /// described by <c>_StartIndex</c>/<c>_Length</c> (if any) and resets the pending state - confirm
        /// against its definition.
        /// </remarks>
        /// <param name="sent">
        /// Sentence to tokenize: <c>startIndex</c> and <c>length</c> locate it relative to <c>_BASE</c>;
        /// <c>urls</c> may be null and is consumed strictly in list order.
        /// </param>
        private void SentSplitterProcessSentCallback(sent_t sent)
        {
            _Words.Clear();
            _StartIndex = sent.startIndex;
            _Length     = 0;
            _StartPtr   = _BASE + _StartIndex;
            _EndPtr     = _StartPtr + sent.length - 1;

            var urls        = sent.urls;
            var urlIndex    = 0;
            // A non-null but empty list must behave like "no urls"; indexing urls[0] would throw otherwise.
            // MAX_PTR acts as a sentinel that the scan pointer never reaches.
            var startUrlPtr = (urls != null && 0 < urls.Count) ? (_BASE + urls[0].startIndex) : UnsafeConst.MAX_PTR;

            #region [.main.]
            // Remember the real sentence end; the main pass works on the (possibly shorter) trimmed range.
            var realEndPtr = _EndPtr;
            _EndPtr = SkipNonLetterAndNonDigitToTheEnd();

            for (_Ptr = _StartPtr; _Ptr <= _EndPtr; _Ptr++)
            {
                #region [.process allocated url's.]
                if (startUrlPtr <= _Ptr)
                {
                    #region [.code.]
                    // Close whatever word was pending before the url.
                    TryCreateWordAndPut2List();

                    // Urls are intentionally ignored here: the range is jumped over without emitting a word.
                    var lenu = urls[urlIndex].length;

                    // Land on the url's last char; the loop's _Ptr++ then moves just past it.
                    _Ptr = startUrlPtr + lenu - 1;
                    urlIndex++;
                    startUrlPtr = (urlIndex < urls.Count) ? (_BASE + urls[urlIndex].startIndex) : UnsafeConst.MAX_PTR;

                    // The next word starts right after the url.
                    _StartIndex = (int)(_Ptr - _BASE + 1);
                    _Length     = 0;
                    continue;

                    #endregion
                }
                #endregion

                var ch = *_Ptr;
                var ct = *(_CTM + ch);
                #region [.whitespace.]
                if ((ct & CharType.IsWhiteSpace) == CharType.IsWhiteSpace)
                {
                    TryCreateWordAndPut2List();

                    // The whitespace char belongs to no word: step the word start over it.
                    _StartIndex++;
                    continue;
                }
                #endregion

                var pct = *(_CCTM + ch);
                #region [.dot.]
                // A dot char for which IsUpperNextChar() holds is appended to the current word, which is then closed.
                if ((pct & CRFCharType.DotChar) == CRFCharType.DotChar &&
                    IsUpperNextChar()
                    )
                {
                    _Length++;
                    TryCreateWordAndPut2List();
                    continue;
                }
                #endregion

                #region [.between-letter-or-digit.]
                if ((pct & CRFCharType.BetweenLetterOrDigit) == CRFCharType.BetweenLetterOrDigit)
                {
                    if (IsBetweenLetterOrDigit())
                    {
                        // The char stays inside the current word.
                        _Length++;
                    }
                    else
                    {
                        TryCreateWordAndPut2List();

                        #region [.merge punctuation (with white-space's).]
                        // A false result ends the main pass; the tail pass resumes from the current _Ptr.
                        if (!MergePunctuation(ch))
                        {
                            break;
                        }
                        #endregion

                        //punctuation word
                        TryCreateWordAndPut2List();
                    }

                    continue;
                }
                // Relies on the 'BetweenLetterOrDigit' and 'BetweenDigit' character lists not overlapping.
                else
                if ((pct & CRFCharType.BetweenDigit) == CRFCharType.BetweenDigit)
                {
                    if (IsBetweenDigit())
                    {
                        // The char stays inside the current word.
                        _Length++;
                    }
                    else
                    {
                        TryCreateWordAndPut2List();

                        #region [.merge punctuation (with white-space's).]
                        if (!MergePunctuation(ch))
                        {
                            break;
                        }
                        #endregion

                        //punctuation word
                        TryCreateWordAndPut2List();
                    }

                    continue;
                }
                #endregion

                #region [.tokenize-different-separately.]
                if ((pct & CRFCharType.TokenizeDifferentSeparately) == CRFCharType.TokenizeDifferentSeparately)
                {
                    TryCreateWordAndPut2List();

                    #region [.merge punctuation (with white-space's).]
                    if (!MergePunctuation(ch))
                    {
                        break;
                    }
                    #endregion

                    //punctuation word
                    TryCreateWordAndPut2List();

                    continue;
                }
                #endregion

                #region [.interprete-as-whitespace.]
                if ((pct & CRFCharType.InterpreteAsWhitespace) == CRFCharType.InterpreteAsWhitespace)
                {
                    TryCreateWordAndPut2List();

                    // Handled exactly like real whitespace.
                    _StartIndex++;
                    continue;
                }
                #endregion

                #region [.increment length.]
                // Ordinary char: extend the current word.
                _Length++;
                #endregion
            }
            #endregion

            #region [.last word.]
            TryCreateWordAndPut2List();
            #endregion

            #region [.tail punctuation.]
            // Restore the real end and continue from where the main pass stopped (_Ptr is not reset).
            for (_EndPtr = realEndPtr; _Ptr <= _EndPtr; _Ptr++)
            {
                var ch = *_Ptr;
                var ct = *(_CTM + ch);
                #region [.whitespace.]
                if ((ct & CharType.IsWhiteSpace) == CharType.IsWhiteSpace)
                {
                    TryCreateWordAndPut2List();

                    _StartIndex++;
                    continue;
                }
                #endregion

                var nct = *(_CCTM + ch);
                #region [.tokenize-different-separately.]
                if ((nct & CRFCharType.TokenizeDifferentSeparately) == CRFCharType.TokenizeDifferentSeparately)
                {
                    TryCreateWordAndPut2List();

                    #region [.merge punctuation (with white-space's).]
                    if (!MergePunctuation(ch))
                    {
                        break;
                    }
                    #endregion

                    //punctuation word
                    TryCreateWordAndPut2List();

                    continue;
                }
                #endregion

                #region [.interprete-as-whitespace.]
                if ((nct & CRFCharType.InterpreteAsWhitespace) == CRFCharType.InterpreteAsWhitespace)
                {
                    TryCreateWordAndPut2List();

                    _StartIndex++;
                    continue;
                }
                #endregion

                #region [.increment length.]
                _Length++;
                #endregion
            }
            #endregion

            #region [.last punctuation.]
            TryCreateWordAndPut2List();
            #endregion

            // Hand the collected words to the consumer.
            _OuterProcessSentCallback_Delegate(_Words);
        }
        /// <summary>
        /// Tokenizes one sentence into <c>_Words</c> and passes that list to <c>_ProcessSentCallback</c>.
        /// </summary>
        /// <remarks>
        /// The sentence characters are read through raw pointers relative to <c>_BASE</c>, in two passes:
        /// <list type="number">
        /// <item>the main pass runs from the sentence start up to the pointer returned by
        /// <c>SkipNonLetterAndNonDigitToTheEnd()</c> and applies the full set of character rules;</item>
        /// <item>the tail pass continues from wherever the main pass stopped up to the real end of the
        /// sentence and applies only the whitespace / separate-token / as-whitespace rules.</item>
        /// </list>
        /// Each URL range listed in <c>sent.urls</c> becomes a single word with
        /// <c>nerInputType = NerInputType.O</c>.
        /// <c>TryCreateWordAndPut2List()</c> is defined elsewhere; presumably it emits the pending word
        /// described by <c>_StartIndex</c>/<c>_Length</c> (if any) and resets the pending state - confirm
        /// against its definition.
        /// </remarks>
        /// <param name="sent">
        /// Sentence to tokenize: <c>startIndex</c> and <c>length</c> locate it relative to <c>_BASE</c>;
        /// <c>urls</c> may be null and is consumed strictly in list order.
        /// </param>
        private void ProcessSentSplitterCallback(sent_t sent)
        {
            _Words.Clear();
            _StartIndex = sent.startIndex;
            _Length     = 0;
            _StartPtr   = _BASE + _StartIndex;
            _EndPtr     = _StartPtr + sent.length - 1;

            var urls        = sent.urls;
            var urlIndex    = 0;
            // A non-null but empty list must behave like "no urls"; indexing urls[0] would throw otherwise.
            // MAX_PTR acts as a sentinel that the scan pointer never reaches.
            var startUrlPtr = (urls != null && 0 < urls.Count) ? (_BASE + urls[0].startIndex) : MAX_PTR;

            #region [.main.]
            // Remember the real sentence end; the main pass works on the (possibly shorter) trimmed range.
            var realEndPtr = _EndPtr;
            _EndPtr = SkipNonLetterAndNonDigitToTheEnd();

            for (_Ptr = _StartPtr; _Ptr <= _EndPtr; _Ptr++)
            {
                #region [.process allocated url's.]
                if (startUrlPtr <= _Ptr)
                {
                    #region [.code.]
                    // Close whatever word was pending before the url.
                    TryCreateWordAndPut2List();

                    #region [.create word. url.]
                    // The whole url range is emitted as one word, copied verbatim from the buffer.
                    var lenu = urls[urlIndex].length;
                    var vu   = new string( startUrlPtr, 0, lenu );
                    var wu   = new word_t()
                    {
                        startIndex    = urls[urlIndex].startIndex,
                        length        = lenu,
                        valueOriginal = vu,
                        nerInputType  = NerInputType.O, //URL
                    };
                    _Words.Add(wu);
                    #endregion

                    // Land on the url's last char; the loop's _Ptr++ then moves just past it.
                    _Ptr = startUrlPtr + lenu - 1;
                    urlIndex++;
                    startUrlPtr = (urlIndex < urls.Count) ? (_BASE + urls[urlIndex].startIndex) : MAX_PTR;

                    // The next word starts right after the url.
                    _StartIndex = (int)(_Ptr - _BASE + 1);
                    _Length     = 0;
                    continue;

                    #endregion
                }
                #endregion

                var ch = *_Ptr;
                var ct = *(_CTM + ch);
                #region [.whitespace.]
                if ((ct & CharType.IsWhiteSpace) == CharType.IsWhiteSpace)
                {
                    TryCreateWordAndPut2List();

                    // The whitespace char belongs to no word: step the word start over it.
                    _StartIndex++;
                    continue;
                }
                #endregion

                var nct = *(_NCTM + ch);
                #region [.dot.]
                // A dot char for which IsUpperNextChar() holds is appended to the current word, which is then closed.
                if ((nct & NERCharType.DotChar) == NERCharType.DotChar &&
                    IsUpperNextChar()
                    )
                {
                    _Length++;
                    TryCreateWordAndPut2List();
                    continue;
                }
                #endregion

                #region [.between-non-whitespace.]
                if ((nct & NERCharType.BetweenLetterOrDigit) == NERCharType.BetweenLetterOrDigit)
                {
                    if (IsBetweenLetterOrDigit())
                    {
                        // The char stays inside the current word.
                        _Length++;
                    }
                    else
                    {
                        TryCreateWordAndPut2List();

                        #region [.merge punctuation (with white-space's).]
                        // A false result ends the main pass; the tail pass resumes from the current _Ptr.
                        if (!MergePunctuation(ch))
                        {
                            break;
                        }
                        #endregion

                        //punctuation word
                        TryCreateWordAndPut2List();
                    }

                    continue;
                }
                #endregion

                #region [.tokenize-different-separately.]
                if ((nct & NERCharType.TokenizeDifferentSeparately) == NERCharType.TokenizeDifferentSeparately)
                {
                    TryCreateWordAndPut2List();

                    #region [.merge punctuation (with white-space's).]
                    if (!MergePunctuation(ch))
                    {
                        break;
                    }
                    #endregion

                    //punctuation word
                    TryCreateWordAndPut2List();

                    continue;
                }
                #endregion

                #region [.interprete-as-whitespace.]
                if ((nct & NERCharType.InterpreteAsWhitespace) == NERCharType.InterpreteAsWhitespace)
                {
                    TryCreateWordAndPut2List();

                    // Handled exactly like real whitespace.
                    _StartIndex++;
                    continue;
                }
                #endregion

                #region [.increment length.]
                // Ordinary char: extend the current word.
                _Length++;
                #endregion
            }
            #endregion

            #region [.last word.]
            TryCreateWordAndPut2List();
            #endregion

            #region [.tail punctuation.]
            // Restore the real end and continue from where the main pass stopped (_Ptr is not reset).
            for (_EndPtr = realEndPtr; _Ptr <= _EndPtr; _Ptr++)
            {
                var ch = *_Ptr;
                var ct = *(_CTM + ch);
                #region [.whitespace.]
                if ((ct & CharType.IsWhiteSpace) == CharType.IsWhiteSpace)
                {
                    TryCreateWordAndPut2List();

                    _StartIndex++;
                    continue;
                }
                #endregion

                var nct = *(_NCTM + ch);
                #region [.tokenize-different-separately.]
                if ((nct & NERCharType.TokenizeDifferentSeparately) == NERCharType.TokenizeDifferentSeparately)
                {
                    TryCreateWordAndPut2List();

                    #region [.merge punctuation (with white-space's).]
                    if (!MergePunctuation(ch))
                    {
                        break;
                    }
                    #endregion

                    //punctuation word
                    TryCreateWordAndPut2List();

                    continue;
                }
                #endregion

                #region [.interprete-as-whitespace.]
                if ((nct & NERCharType.InterpreteAsWhitespace) == NERCharType.InterpreteAsWhitespace)
                {
                    TryCreateWordAndPut2List();

                    _StartIndex++;
                    continue;
                }
                #endregion

                #region [.increment length.]
                _Length++;
                #endregion
            }
            #endregion

            #region [.last punctuation.]
            TryCreateWordAndPut2List();
            #endregion

            // Hand the collected words to the consumer.
            _ProcessSentCallback(_Words);
        }