Exemplo n.º 1
0
        /// <summary>
        /// Initializes the tokenizer from <paramref name="config"/>: creates the sentence splitter and its
        /// callback delegate, allocates the word list and the to-upper buffer, caches the character lookup
        /// maps, and selects the POS-tagger / NER input-type processors according to the tokenize mode.
        /// </summary>
        /// <param name="config">
        /// Source of the sentence-splitter config, model, language type, tokenize mode and processor factories.
        /// </param>
        public Tokenizer(TokenizerConfig config)
        {
            // Sentence splitting and per-sentence working state.
            _SentSplitter = new SentSplitter(config.SentSplitterConfig);
            _Words = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _ParticleThatExclusion = config.Model.ParticleThatExclusion;
            _SentSplitterProcessSentCallback_Delegate =
                new SentSplitter.ProcessSentCallbackDelegate(SentSplitterProcessSentCallback);

            // Character lookup maps: two shared ones plus a language-specific CRF char-type map.
            var xlat = xlat_Unsafe.Inst;
            _UIM = xlat._UPPER_INVARIANT_MAP;
            _CTM = xlat._CHARTYPE_MAP;

            var languageConsts = UnsafeConst.GetInstanceByLanguage(config.LanguageType);
            _CCTM = languageConsts._CRF_CHARTYPE_MAP;

            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

            // A real processor is created only when its flag is set in the tokenize mode;
            // otherwise the shared dummy instance is used.
            var mode = config.TokenizeMode;
            var posTaggerRequested = (mode & TokenizeMode.PosTagger) == TokenizeMode.PosTagger;
            var nerRequested = (mode & TokenizeMode.Ner) == TokenizeMode.Ner;

            if (!posTaggerRequested)
            {
                _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
            }
            else
            {
                _PosTaggerInputTypeProcessor = config.PosTaggerInputTypeProcessorFactory.CreateInstance();
            }

            if (!nerRequested)
            {
                _NerInputTypeProcessor = Dummy_NerInputTypeProcessor.Instance;
            }
            else
            {
                _NerInputTypeProcessor = config.NerInputTypeProcessorFactory.CreateInstance();
            }
        }
Exemplo n.º 2
0
            /// <summary>
            /// Builds the CRF char-type lookup table (one byte per UTF-16 code unit) for
            /// <paramref name="languageType"/>, pins it, and stores its address in <c>_CRF_CHARTYPE_MAP</c>.
            /// </summary>
            /// <param name="languageType">
            /// Selects which "between letter or digit" character set is applied: the English set for
            /// <c>LanguageTypeEnum.En</c>, the default set otherwise.
            /// </param>
            private UnsafeConst(LanguageTypeEnum languageType)
            {
                var table = new byte[char.MaxValue + 1];

                fixed (byte* map = table)
                {
                    // Step 1: every punctuation character starts out as "interpret as whitespace".
                    // An int counter lets the loop include char.MaxValue without wrapping around.
                    for (int code = char.MinValue; code <= char.MaxValue; code++)
                    {
                        if (char.IsPunctuation((char)code))
                        {
                            map[code] = (byte)CRFCharType.InterpreteAsWhitespace;
                        }
                    }

                    // Step 2: explicit overrides; these replace whatever step 1 stored.
                    foreach (var ch in INCLUDE_INTERPRETE_AS_WHITESPACE)
                    {
                        map[ch] = (byte)CRFCharType.InterpreteAsWhitespace;
                    }

                    foreach (var ch in TOKENIZE_DIFFERENT_SEPARATELY)
                    {
                        map[ch] = (byte)CRFCharType.TokenizeDifferentSeparately;
                    }

                    // Step 3: additive flags, OR-ed onto the value already present.
                    var betweenLetterOrDigit = (languageType == LanguageTypeEnum.En)
                        ? BETWEEN_LETTER_OR_DIGIT_EN
                        : BETWEEN_LETTER_OR_DIGIT;

                    foreach (var ch in betweenLetterOrDigit)
                    {
                        map[ch] |= (byte)CRFCharType.BetweenLetterOrDigit;
                    }

                    foreach (var ch in BETWEEN_DIGIT)
                    {
                        map[ch] |= (byte)CRFCharType.BetweenDigit;
                    }

                    // Step 4: the dot entry is overwritten (not OR-ed), discarding any earlier flags.
                    map[DOT] = (byte)CRFCharType.DotChar;
                }

                // Pin the array so the raw pointer stays valid; the handle is not kept, so it is never freed.
                // NOTE(review): the cast presumes CRFCharType is a byte-sized enum — confirm at its declaration.
                var pinned = GCHandle.Alloc(table, GCHandleType.Pinned);

                _CRF_CHARTYPE_MAP = (CRFCharType*)pinned.AddrOfPinnedObject().ToPointer();
            }
Exemplo n.º 3
0
        /// <summary>
        /// Initializes the tokenizer from a <see cref="TokenizerConfig4NerModelBuilder"/>: creates a URL
        /// detector forced into Position extract mode, allocates the sentence/word buffers, caches the
        /// character lookup maps, and wires a dummy POS-tagger processor plus a NER processor created
        /// from the config's factory.
        /// </summary>
        /// <param name="config">
        /// Source of the URL detector config, model, language type and NER processor factory.
        /// </param>
        private Tokenizer(TokenizerConfig4NerModelBuilder config)
        {
            // The URL detector always runs in Position mode here, overriding whatever the config carried.
            // NOTE(review): this writes to the UrlDetectorConfig object reachable from the caller's config —
            // confirm callers do not reuse that object expecting their original UrlExtractMode.
            config.UrlDetectorConfig.UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position;
            _urlDetector = new UrlDetector(config.UrlDetectorConfig);

            // Working buffers.
            _buildModelSentence = Sentence.CreateEmpty();
            _words = new List<Word>(DEFAULT_WORDSLIST_CAPACITY);
            _buildModelWords = new List<Buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);

            _particleThatExclusion = config.Model.ParticleThatExclusion;

            // Character lookup maps: two shared ones plus a language-specific CRF char-type map.
            var xlat = XlatUnsafe.Inst;
            _UIM = xlat._UPPER_INVARIANT_MAP;
            _CTM = xlat._CHARTYPE_MAP;

            var languageConsts = UnsafeConst.GetInstanceByLanguage(config.LanguageType);
            _CCTM = languageConsts._CRF_CHARTYPE_MAP;

            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

            // POS tagging is stubbed out with the dummy processor; only the NER processor is real.
            _posTaggerInputTypeProcessor = DummyPosTaggerInputTypeProcessor.Instance;
            _nerInputTypeProcessor = config.NerInputTypeProcessorFactory.CreateInstance();
        }
Exemplo n.º 4
0
        /// <summary>
        /// Initializes the tokenizer from a <see cref="TokenizerConfig4NerModelBuilder"/>: creates a URL
        /// detector in Position extract mode, allocates the sentence/word buffers, caches the character
        /// lookup maps, and wires a dummy POS-tagger processor plus a NER processor created from the
        /// config's factory.
        /// </summary>
        /// <param name="config">
        /// Source of the URL detector model, tokenizer model, language type and NER processor factory.
        /// </param>
        private Tokenizer(TokenizerConfig4NerModelBuilder config)
        {
            // A fresh UrlDetectorConfig is built from the incoming Model with Position extract mode,
            // rather than modifying the config object that was passed in.
            var urlDetectorConfig = new UrlDetectorConfig()
            {
                Model          = config.UrlDetectorConfig.Model,
                UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position,
            };
            _UrlDetector = new UrlDetector(urlDetectorConfig);

            // Working buffers.
            _BuildModelSent = sent_t.CreateEmpty();
            _Words = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _BuildModelWords = new List<buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);

            _ParticleThatExclusion = config.Model.ParticleThatExclusion;

            // Character lookup maps: two shared ones plus a language-specific CRF char-type map.
            var xlat = xlat_Unsafe.Inst;
            _UIM = xlat._UPPER_INVARIANT_MAP;
            _CTM = xlat._CHARTYPE_MAP;

            var languageConsts = UnsafeConst.GetInstanceByLanguage(config.LanguageType);
            _CCTM = languageConsts._CRF_CHARTYPE_MAP;

            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

            // POS tagging is stubbed out with the dummy processor; only the NER processor is real.
            _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
            _NerInputTypeProcessor = config.NerInputTypeProcessorFactory.CreateInstance();
        }
Exemplo n.º 5
0
            /// <summary>
            /// Builds the CRF char-type lookup table (one byte per UTF-16 code unit) for
            /// <paramref name="languageType"/>, pins it, and stores its address in <c>_CRF_CHARTYPE_MAP</c>.
            /// </summary>
            /// <param name="languageType">
            /// Selects which "between letter or digit" character set is applied: the English set for
            /// <c>LanguageTypeEnum.En</c>, the default set otherwise.
            /// </param>
            private UnsafeConst(LanguageTypeEnum languageType)
            {
                var charTypes = new byte[char.MaxValue + 1];

                // Baseline: punctuation is interpreted as whitespace; everything else stays 0.
                for (var code = 0; code < charTypes.Length; code++)
                {
                    if (char.IsPunctuation((char)code))
                    {
                        charTypes[code] = (byte)CRFCharType.InterpreteAsWhitespace;
                    }
                }

                // Explicit overrides replace the baseline value outright.
                foreach (var ch in INCLUDE_INTERPRETE_AS_WHITESPACE)
                {
                    charTypes[ch] = (byte)CRFCharType.InterpreteAsWhitespace;
                }

                foreach (var ch in TOKENIZE_DIFFERENT_SEPARATELY)
                {
                    charTypes[ch] = (byte)CRFCharType.TokenizeDifferentSeparately;
                }

                // "Between" flags are additive: they are OR-ed onto whatever is already stored.
                var betweenLetterOrDigit = (languageType == LanguageTypeEnum.En)
                    ? BETWEEN_LETTER_OR_DIGIT_EN
                    : BETWEEN_LETTER_OR_DIGIT;

                foreach (var ch in betweenLetterOrDigit)
                {
                    charTypes[ch] |= (byte)CRFCharType.BetweenLetterOrDigit;
                }

                foreach (var ch in BETWEEN_DIGIT)
                {
                    charTypes[ch] |= (byte)CRFCharType.BetweenDigit;
                }

                // Plain assignment on purpose: the dot entry must hold DotChar alone.
                // The original author flagged OR-ing DotChar onto the existing value as an error.
                charTypes[DOT] = (byte)CRFCharType.DotChar;

                // Pin the array so the raw pointer stays valid; the handle is not kept, so it is never freed.
                // NOTE(review): the cast presumes CRFCharType is a byte-sized enum — confirm at its declaration.
                var pinned = GCHandle.Alloc(charTypes, GCHandleType.Pinned);
                _CRF_CHARTYPE_MAP = (CRFCharType*)pinned.AddrOfPinnedObject().ToPointer();
            }