Пример #1
0
        /// <summary>
        /// Emits the token currently being accumulated (the <c>_Length</c> characters starting at
        /// offset <c>_StartIndex</c> from <c>_BASE</c>) as a new <c>word_t</c> appended to
        /// <c>_Words</c>, then moves <c>_StartIndex</c> past it and resets <c>_Length</c> to zero.
        /// Does nothing when no characters have been accumulated.
        /// </summary>
        private void TryCreateWordAndPut2List()
        {
            // Guard: an empty accumulator means there is no token to emit.
            if (_Length == 0)
            {
                return;
            }

            #region [.create word.]
            var text  = new string( _BASE, _StartIndex, _Length );
            var token = new word_t()
            {
                startIndex    = _StartIndex,
                length        = _Length,
                valueOriginal = text,
                nerInputType  = NerInputTypeProcessor.GetNerInputType(_CTM, _BASE + _StartIndex, _Length),
            };
            _Words.Add(token);
            #endregion

            // The next token begins right after the one just emitted.
            _StartIndex += _Length;
            _Length      = 0;
        }
Пример #2
0
        /// <summary>
        /// Tokenizes a single sentence into <c>word_t</c> items collected in <c>_Words</c> and then
        /// passes that list to <c>_ProcessSentCallback</c>.
        /// </summary>
        /// <remarks>
        /// The sentence is scanned in two passes over the character buffer addressed by <c>_BASE</c>:
        /// <list type="number">
        /// <item>the main pass runs from the sentence start up to the pointer returned by
        /// <c>SkipNonLetterAndNonDigitToTheEnd()</c>;</item>
        /// <item>the tail pass continues from wherever the main pass stopped up to the real last
        /// character of the sentence.</item>
        /// </list>
        /// Spans listed in <c>sent.urls</c> are emitted as single words tagged <c>NerInputType.O</c>.
        /// NOTE(review): the url entries are consumed strictly in list order, so they are presumably
        /// expected to be sorted by <c>startIndex</c> - confirm against the sentence splitter.
        /// </remarks>
        /// <param name="sent">
        /// Sentence to tokenize. Its <c>startIndex</c> and <c>length</c> are used as an offset/count
        /// relative to <c>_BASE</c>; <c>urls</c> may be <c>null</c> or empty.
        /// </param>
        private void ProcessSentSplitterCallback(sent_t sent)
        {
            _Words.Clear();
            _StartIndex = sent.startIndex;
            _Length     = 0;
            _StartPtr   = _BASE + _StartIndex;
            _EndPtr     = _StartPtr + sent.length - 1; // inclusive: points at the last character

            var urls     = sent.urls;
            var urlIndex = 0;
            // Pointer to the next not-yet-emitted url, or MAX_PTR when there is none left.
            // The Count check mirrors the one used when advancing to the next url below, so a
            // non-null but empty collection no longer fails on urls[0].
            // NOTE(review): MAX_PTR is assumed to compare greater than any pointer into the buffer.
            var startUrlPtr = (urls != null && urlIndex < urls.Count) ? (_BASE + urls[urlIndex].startIndex) : MAX_PTR;

            #region [.main.]
            // Remember the real sentence end; the main pass works on a shortened range and the
            // tail pass below restores this value.
            var sentenceEndPtr = _EndPtr;
            _EndPtr = SkipNonLetterAndNonDigitToTheEnd();

            for (_Ptr = _StartPtr; _Ptr <= _EndPtr; _Ptr++)
            {
                #region [.process allocated url's.]
                if (startUrlPtr <= _Ptr)
                {
                    #region [.code.]
                    // Flush whatever was accumulated before the url.
                    TryCreateWordAndPut2List();

                    #region [.create word. url.]
                    var lenu = urls[urlIndex].length;
                    var vu   = new string( startUrlPtr, 0, lenu );
                    var wu   = new word_t()
                    {
                        startIndex    = urls[urlIndex].startIndex,
                        length        = lenu,
                        valueOriginal = vu,
                        nerInputType  = NerInputType.O, //URL
                    };
                    _Words.Add(wu);
                    #endregion

                    // Park on the url's last character; the loop increment steps past it.
                    _Ptr = startUrlPtr + lenu - 1;
                    urlIndex++;
                    startUrlPtr = (urlIndex < urls.Count) ? (_BASE + urls[urlIndex].startIndex) : MAX_PTR;

                    // The next token starts right after the url.
                    _StartIndex = (int)(_Ptr - _BASE + 1);
                    _Length     = 0;
                    continue;

                    #endregion
                }
                #endregion

                var ch = *_Ptr;
                var ct = *(_CTM + ch); // character-type flags looked up by character code
                #region [.whitespace.]
                if ((ct & CharType.IsWhiteSpace) == CharType.IsWhiteSpace)
                {
                    TryCreateWordAndPut2List();

                    // Skip the whitespace character itself.
                    _StartIndex++;
                    continue;
                }
                #endregion

                var nct = *(_NCTM + ch); // NER-specific character-type flags
                #region [.dot.]
                // A dot for which IsUpperNextChar() holds is appended to the current token,
                // which is then closed.
                if ((nct & NERCharType.DotChar) == NERCharType.DotChar &&
                    IsUpperNextChar()
                    )
                {
                    _Length++;
                    TryCreateWordAndPut2List();
                    continue;
                }
                #endregion

                #region [.between-non-whitespace.]
                if ((nct & NERCharType.BetweenLetterOrDigit) == NERCharType.BetweenLetterOrDigit)
                {
                    if (IsBetweenLetterOrDigit())
                    {
                        // Stays part of the current token.
                        _Length++;
                    }
                    else
                    {
                        TryCreateWordAndPut2List();

                        #region [.merge punctuation (with white-space's).]
                        // NOTE(review): MergePunctuation is defined elsewhere; it presumably
                        // updates _Ptr/_Length for the punctuation run - confirm. A false
                        // result ends the main pass.
                        if (!MergePunctuation(ch))
                        {
                            break;
                        }
                        #endregion

                        //punctuation word
                        TryCreateWordAndPut2List();
                    }

                    continue;
                }
                #endregion

                #region [.tokenize-different-separately.]
                if ((nct & NERCharType.TokenizeDifferentSeparately) == NERCharType.TokenizeDifferentSeparately)
                {
                    TryCreateWordAndPut2List();

                    #region [.merge punctuation (with white-space's).]
                    if (!MergePunctuation(ch))
                    {
                        break;
                    }
                    #endregion

                    //punctuation word
                    TryCreateWordAndPut2List();

                    continue;
                }
                #endregion

                #region [.interprete-as-whitespace.]
                if ((nct & NERCharType.InterpreteAsWhitespace) == NERCharType.InterpreteAsWhitespace)
                {
                    TryCreateWordAndPut2List();

                    // Dropped like whitespace: skip the character itself.
                    _StartIndex++;
                    continue;
                }
                #endregion

                #region [.increment length.]
                _Length++;
                #endregion
            }
            #endregion

            #region [.last word.]
            TryCreateWordAndPut2List();
            #endregion

            #region [.tail punctuation.]
            // Continue from where the main pass stopped, now up to the real sentence end.
            for (_EndPtr = sentenceEndPtr; _Ptr <= _EndPtr; _Ptr++)
            {
                var ch = *_Ptr;
                var ct = *(_CTM + ch);
                #region [.whitespace.]
                if ((ct & CharType.IsWhiteSpace) == CharType.IsWhiteSpace)
                {
                    TryCreateWordAndPut2List();

                    _StartIndex++;
                    continue;
                }
                #endregion

                var nct = *(_NCTM + ch);
                #region [.tokenize-different-separately.]
                if ((nct & NERCharType.TokenizeDifferentSeparately) == NERCharType.TokenizeDifferentSeparately)
                {
                    TryCreateWordAndPut2List();

                    #region [.merge punctuation (with white-space's).]
                    if (!MergePunctuation(ch))
                    {
                        break;
                    }
                    #endregion

                    //punctuation word
                    TryCreateWordAndPut2List();

                    continue;
                }
                #endregion

                #region [.interprete-as-whitespace.]
                if ((nct & NERCharType.InterpreteAsWhitespace) == NERCharType.InterpreteAsWhitespace)
                {
                    TryCreateWordAndPut2List();

                    _StartIndex++;
                    continue;
                }
                #endregion

                #region [.increment length.]
                _Length++;
                #endregion
            }
            #endregion

            #region [.last punctuation.]
            TryCreateWordAndPut2List();
            #endregion

            _ProcessSentCallback(_Words);
        }