Exemplo n.º 1
0
        private char *_ptr;              // pointer to the current position in the text

        #region ctor dtor dispose
        /// <summary>
        /// Initializes the detector from <paramref name="config"/>: copies the model's
        /// first-level-domain and URI-scheme data, creates the working lists, string builder and
        /// <c>Url</c> instance, caches the character lookup tables, and pins two char buffers.
        /// </summary>
        /// <param name="config">Provides the model (<c>Model</c>) and the extract mode (<c>UrlExtractMode</c>).</param>
        /// <remarks>
        /// NOTE(review): the two pinned <c>GCHandle</c>s are stored in fields and are not released
        /// here; presumably they are freed by the dispose logic of this class - confirm.
        /// </remarks>
        public UrlDetector(UrlDetectorConfig config)
        {
            // The URL value is extracted only when the mode asks for value and position.
            _extractValue = (config.UrlExtractMode == UrlExtractModeEnum.ValueAndPosition);

            var model = config.Model;

            _firstLevelDomains          = model.FirstLevelDomains;
            _firstLevelDomainsMaxLength = model.FirstLevelDomainsMaxLength;
            _URIschemes                 = model.URIschemes;
            _URIschemesMaxLength        = model.URIschemesMaxLength;

            _urls          = new List<Url>(DEFAULT_LIST_CAPACITY);
            _stringBuilder = new StringBuilder();
            _url           = new Url();
            _urlStructs    = new List<UrlStruct>(DEFAULT_LIST_CAPACITY);

            var unsafeXlat = XlatUnsafe.Inst;
            _CTM = unsafeXlat._CHARTYPE_MAP;
            _UIM = unsafeXlat._UPPER_INVARIANT_MAP;

            // Buffer for first-level domains: one element longer than the longest domain, pinned
            // so that its base address can be kept as a raw pointer.
            _firstLevelDomainBuffer         = new char[_firstLevelDomainsMaxLength + 1];
            _firstLevelDomainBufferGCHandle = GCHandle.Alloc(_firstLevelDomainBuffer, GCHandleType.Pinned);
            var fldBufferAddress = _firstLevelDomainBufferGCHandle.AddrOfPinnedObject();
            _fldBufferPtrBase    = (char *)fldBufferAddress.ToPointer();

            // Same arrangement for URI schemes.
            _URIschemesBuffer         = new char[_URIschemesMaxLength + 1];
            _URIschemesBufferGCHandle = GCHandle.Alloc(_URIschemesBuffer, GCHandleType.Pinned);
            var uriSchBufferAddress = _URIschemesBufferGCHandle.AddrOfPinnedObject();
            _uriSchBufferPtrBase    = (char *)uriSchBufferAddress.ToPointer();
        }
Exemplo n.º 2
0
        /// <summary>
        /// Builds a tokenizer from <paramref name="config"/>: creates the sentence splitter and the
        /// word list, caches the character lookup tables, calls <c>ReAllocWordToUpperBuffer</c> with
        /// the default size, and selects the POS-tagger and NER input-type processors.
        /// </summary>
        /// <param name="config">
        /// Tokenizer settings. A real processor is created through the corresponding factory only
        /// when the matching <c>TokenizeMode</c> flag is set; otherwise the dummy instance is used.
        /// </param>
        public Tokenizer(TokenizerConfig config)
        {
            _SentSplitter          = new SentSplitter(config.SentSplitterConfig);
            _Words                 = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _ParticleThatExclusion = config.Model.ParticleThatExclusion;
            _SentSplitterProcessSentCallback_Delegate = new SentSplitter.ProcessSentCallbackDelegate(SentSplitterProcessSentCallback);

            var unsafeXlat = xlat_Unsafe.Inst;
            _UIM  = unsafeXlat._UPPER_INVARIANT_MAP;
            _CTM  = unsafeXlat._CHARTYPE_MAP;
            _CCTM = UnsafeConst.GetInstanceByLanguage(config.LanguageType)._CRF_CHARTYPE_MAP;

            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

            // POS-tagger input-type processor: dummy unless the PosTagger flag is set.
            var posTaggerRequested = ((config.TokenizeMode & TokenizeMode.PosTagger) == TokenizeMode.PosTagger);
            if (!posTaggerRequested)
            {
                _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
            }
            else
            {
                _PosTaggerInputTypeProcessor = config.PosTaggerInputTypeProcessorFactory.CreateInstance();
            }

            // NER input-type processor: dummy unless the Ner flag is set.
            var nerRequested = ((config.TokenizeMode & TokenizeMode.Ner) == TokenizeMode.Ner);
            if (!nerRequested)
            {
                _NerInputTypeProcessor = Dummy_NerInputTypeProcessor.Instance;
            }
            else
            {
                _NerInputTypeProcessor = config.NerInputTypeProcessorFactory.CreateInstance();
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Strips leading and trailing non-letter characters from <paramref name="term"/> and maps
        /// the remaining characters through the upper-case table.
        /// </summary>
        /// <param name="ctm">Table indexed by character code that yields the character's <c>CharType</c> flags.</param>
        /// <param name="uim">Table indexed by character code that yields the upper-case (invariant) character.</param>
        /// <param name="term">The term to normalize. Its content is never modified.</param>
        /// <returns>
        /// The normalized term. Returns <c>null</c> when non-letters had to be trimmed and at most
        /// one character is left (this includes non-empty terms without any letter). When nothing
        /// is trimmed and every character is already in its mapped form, the original
        /// <paramref name="term"/> instance is returned (this covers the empty string).
        /// </returns>
        /// <remarks>
        /// Strings are immutable, so the result is built in a separate buffer instead of writing
        /// through the pinned pointer into <paramref name="term"/>; writing into the string could
        /// corrupt interned or otherwise shared instances.
        /// </remarks>
        unsafe private static string NormalizeTerm(CharType *ctm, char *uim, string term)
        {
            var last = term.Length - 1;

            fixed(char *ptr = term)
            {
                // Index of the first letter (ends up at last + 1 when there is none).
                var start = 0;
                while (start <= last && (*(ctm + *(ptr + start)) & CharType.IsLetter) != CharType.IsLetter)
                {
                    start++;
                }

                // Index of the last letter, never moving below 'start'.
                var end = last;
                while (start < end && (*(ctm + *(ptr + end)) & CharType.IsLetter) != CharType.IsLetter)
                {
                    end--;
                }

                var trimmed = (start != 0 || end != last);
                if (trimmed)
                {
                    // Nothing, or only a single character, survived the trimming.
                    if (end <= start)
                    {
                        return(null);
                    }
                }
                else
                {
                    // Nothing trimmed: if the mapping changes no character, reuse the input.
                    var unchanged = true;
                    for (var i = 0; i <= last; i++)
                    {
                        if (*(uim + *(ptr + i)) != *(ptr + i))
                        {
                            unchanged = false;
                            break;
                        }
                    }
                    if (unchanged)
                    {
                        return(term);
                    }
                }

                // Build the upper-cased slice [start..end] in a fresh buffer.
                var length = end - start + 1;
                var buffer = new char[length];
                for (var i = 0; i < length; i++)
                {
                    buffer[i] = *(uim + *(ptr + start + i));
                }
                return(new string(buffer));
            }
        }
Exemplo n.º 4
0
 /// <summary>
 /// Wires up the morphological analysis pipeline: the analyzer over <paramref name="morphoModel"/>,
 /// the ambiguity pre-processor and resolver, and the working buffers.
 /// </summary>
 /// <param name="morphoModel">Model stored in the instance and handed to the <c>MorphoAnalyzer</c>.</param>
 /// <param name="morphoAmbiguityModel">Model handed to the <c>MorphoAmbiguityResolver</c>.</param>
 public PosTaggerMorphoAnalyzer(IMorphoModel morphoModel, MorphoAmbiguityResolverModel morphoAmbiguityModel)
 {
     // Analyzer over the supplied model.
     _morphoModel    = morphoModel;
     _morphoAnalyzer = new MorphoAnalyzer(morphoModel);

     // Ambiguity handling.
     _morphoAmbiguityPreProcessor = new MorphoAmbiguityPreProcessor();
     _morphoAmbiguityResolver     = new MorphoAmbiguityResolver(morphoAmbiguityModel);

     // Working collections and the ambiguity factory (constructed with a null argument).
     _wordFormMorphologies_Buffer = new List<WordFormMorphology>();
     _wordMorphoAmbiguityFactory  = new WordMorphoAmbiguityFactory(null);
     _wordMorphoAmbiguities       = new List<WordMorphoAmbiguity>();

     // Character-type table taken from the shared XlatUnsafe instance.
     var unsafeXlat = XlatUnsafe.Inst;
     CTM = unsafeXlat._CHARTYPE_MAP;
 }
Exemplo n.º 5
0
        /// <summary>
        /// Encodes the marker constants as UTF-8, pins the resulting byte arrays and stores raw
        /// pointers to their first bytes. When the corresponding compilation symbols are defined,
        /// it also pins the character-type and upper-case tables and stores pointers to them.
        /// </summary>
        /// <remarks>
        /// The <c>GCHandle</c>s are held only in locals and are not freed in this constructor.
        /// </remarks>
        private xlat_Unsafe()
        {
            // "Other" input type - generic, POS-tagger and NER variants. All three are encoded
            // from the same INPUTTYPE_OTHER constant but get separate pinned arrays.
            var otherHandle = GCHandle.Alloc(Encoding.UTF8.GetBytes(INPUTTYPE_OTHER), GCHandleType.Pinned);
            _InputtypeOtherPtrBase = (byte *)otherHandle.AddrOfPinnedObject().ToPointer();

            var posOtherHandle = GCHandle.Alloc(Encoding.UTF8.GetBytes(INPUTTYPE_OTHER), GCHandleType.Pinned);
            _PosInputtypeOtherPtrBase = (byte *)posOtherHandle.AddrOfPinnedObject().ToPointer();

            var nerOtherHandle = GCHandle.Alloc(Encoding.UTF8.GetBytes(INPUTTYPE_OTHER), GCHandleType.Pinned);
            _NerInputtypeOtherPtrBase = (byte *)nerOtherHandle.AddrOfPinnedObject().ToPointer();

            // Begin-of-sentence marker.
            var beginHandle = GCHandle.Alloc(Encoding.UTF8.GetBytes(BEGIN_OF_SENTENCE), GCHandleType.Pinned);
            _BeginOfSentencePtrBase = (byte *)beginHandle.AddrOfPinnedObject().ToPointer();

            // End-of-sentence marker.
            var endHandle = GCHandle.Alloc(Encoding.UTF8.GetBytes(END_OF_SENTENCE), GCHandleType.Pinned);
            _EndOfSentencePtrBase = (byte *)endHandle.AddrOfPinnedObject().ToPointer();

#if XLAT_CHARTYPE_MAP
            // Character-type table: copied element by element into a ushort array, and that copy
            // (not xlat.CHARTYPE_MAP itself) is pinned.
            var charTypeCopy = new ushort[xlat.CHARTYPE_MAP.Length];
            for (int index = 0; index < charTypeCopy.Length; index++)
            {
                charTypeCopy[index] = (ushort)xlat.CHARTYPE_MAP[index];
            }
            var charTypeHandle = GCHandle.Alloc(charTypeCopy, GCHandleType.Pinned);
            _CHARTYPE_MAP = (CharType *)charTypeHandle.AddrOfPinnedObject().ToPointer();
#endif
#if XLAT_UPPER_INVARIANT_MAP
            // Upper-case table: pinned in place.
            var upperMapHandle = GCHandle.Alloc(xlat.UPPER_INVARIANT_MAP, GCHandleType.Pinned);
            _UPPER_INVARIANT_MAP = (char *)upperMapHandle.AddrOfPinnedObject().ToPointer();
#endif
        }
Exemplo n.º 6
0
        /// <summary>
        /// Encodes the marker constants as UTF-8, pins the resulting byte arrays and stores raw
        /// pointers to their first bytes. When the corresponding compilation symbols are defined,
        /// it also pins the character-type, upper-case and lower-case tables and stores pointers
        /// to them.
        /// </summary>
        /// <remarks>
        /// The <c>GCHandle</c>s are held only in locals and are not freed in this constructor.
        /// </remarks>
        private xlat_Unsafe()
        {
            // "Other" input type - generic, POS-tagger and NER variants. All three are encoded
            // from the same INPUTTYPE_OTHER constant but get separate pinned arrays.
            var otherHandle = GCHandle.Alloc(Encoding.UTF8.GetBytes(INPUTTYPE_OTHER), GCHandleType.Pinned);
            _InputtypeOtherPtrBase = (byte *)otherHandle.AddrOfPinnedObject().ToPointer();

            var posOtherHandle = GCHandle.Alloc(Encoding.UTF8.GetBytes(INPUTTYPE_OTHER), GCHandleType.Pinned);
            _PosInputtypeOtherPtrBase = (byte *)posOtherHandle.AddrOfPinnedObject().ToPointer();

            var nerOtherHandle = GCHandle.Alloc(Encoding.UTF8.GetBytes(INPUTTYPE_OTHER), GCHandleType.Pinned);
            _NerInputtypeOtherPtrBase = (byte *)nerOtherHandle.AddrOfPinnedObject().ToPointer();

            // Begin-of-sentence marker.
            var beginHandle = GCHandle.Alloc(Encoding.UTF8.GetBytes(BEGIN_OF_SENTENCE), GCHandleType.Pinned);
            _BeginOfSentencePtrBase = (byte *)beginHandle.AddrOfPinnedObject().ToPointer();

            // End-of-sentence marker.
            var endHandle = GCHandle.Alloc(Encoding.UTF8.GetBytes(END_OF_SENTENCE), GCHandleType.Pinned);
            _EndOfSentencePtrBase = (byte *)endHandle.AddrOfPinnedObject().ToPointer();

#if XLAT_CHARTYPE_MAP
            // Character-type table: copied element by element into a ushort array, and that copy
            // (not xlat.CHARTYPE_MAP itself) is pinned.
            var charTypeCopy = new ushort[xlat.CHARTYPE_MAP.Length];
            for (int index = 0; index < charTypeCopy.Length; index++)
            {
                charTypeCopy[index] = (ushort)xlat.CHARTYPE_MAP[index];
            }
            var charTypeHandle = GCHandle.Alloc(charTypeCopy, GCHandleType.Pinned);
            _CHARTYPE_MAP = (CharType *)charTypeHandle.AddrOfPinnedObject().ToPointer();
#endif
#if XLAT_UPPER_INVARIANT_MAP
            // Upper-case table: pinned in place.
            var upperMapHandle = GCHandle.Alloc(xlat.UPPER_INVARIANT_MAP, GCHandleType.Pinned);
            _UPPER_INVARIANT_MAP = (char *)upperMapHandle.AddrOfPinnedObject().ToPointer();
#endif
#if XLAT_LOWER_INVARIANT_MAP
            // Lower-case table: pinned in place.
            var lowerMapHandle = GCHandle.Alloc(xlat.LOWER_INVARIANT_MAP, GCHandleType.Pinned);
            _LOWER_INVARIANT_MAP = (char *)lowerMapHandle.AddrOfPinnedObject().ToPointer();
#endif
        }
Exemplo n.º 7
0
        /// <summary>
        /// Splits <paramref name="text"/> into sentences and processes each one through
        /// <c>ProcessSentSplitterCallback</c>, with <paramref name="processSentCallback"/>
        /// installed as the per-call callback.
        /// </summary>
        /// <param name="text">Text to process; it is pinned for the duration of the call.</param>
        /// <param name="splitBySmiles">Forwarded to <c>_SentSplitter.SplitBySmiles</c>.</param>
        /// <param name="processSentCallback">Callback stored in <c>_ProcessSentCallback</c> while the call runs.</param>
        /// <remarks>
        /// The callback field is cleared in a <c>finally</c> block so that it does not keep
        /// referencing the caller's delegate when the splitter or the callback throws.
        /// </remarks>
        public void run(string text, bool splitBySmiles, ProcessSentCallbackDelegate processSentCallback)
        {
            _ProcessSentCallback = processSentCallback;

            try
            {
                fixed(char *_base = text)
                fixed(CharType * ctm     = xlat.CHARTYPE_MAP)
                fixed(NERCharType * nctm = NER_CHARTYPE_MAP)
                {
                    // These pointers are pinned only while this fixed block is executing.
                    _BASE = _base;
                    _CTM  = ctm;
                    _NCTM = nctm;

                    _SentSplitter.SplitBySmiles = splitBySmiles;
                    _SentSplitter.AllocateSents(text, ProcessSentSplitterCallback);
                }
            }
            finally
            {
                // Always drop the per-call callback, including on failure.
                _ProcessSentCallback = null;
            }
        }
Exemplo n.º 8
0
        /// <summary>
        /// Builds a tokenizer for NER model building: creates the URL detector in position-only
        /// mode, the empty build-model sentence and the word lists, caches the character lookup
        /// tables, and installs the dummy POS-tagger processor plus a real NER processor.
        /// </summary>
        /// <param name="config">
        /// Model-builder settings. Note that <c>config.UrlDetectorConfig.UrlExtractMode</c> is
        /// overwritten on the supplied instance.
        /// </param>
        private Tokenizer(TokenizerConfig4NerModelBuilder config)
        {
            // Force position-only extraction on the caller's URL detector config before using it.
            config.UrlDetectorConfig.UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position;
            _urlDetector = new UrlDetector(config.UrlDetectorConfig);

            _buildModelSentence    = Sentence.CreateEmpty();
            _words                 = new List<Word>(DEFAULT_WORDSLIST_CAPACITY);
            _buildModelWords       = new List<Buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _particleThatExclusion = config.Model.ParticleThatExclusion;

            var unsafeXlat = XlatUnsafe.Inst;
            _UIM  = unsafeXlat._UPPER_INVARIANT_MAP;
            _CTM  = unsafeXlat._CHARTYPE_MAP;
            _CCTM = UnsafeConst.GetInstanceByLanguage(config.LanguageType)._CRF_CHARTYPE_MAP;

            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

            // POS tagging always uses the dummy processor here; NER gets a factory-made one.
            _posTaggerInputTypeProcessor = DummyPosTaggerInputTypeProcessor.Instance;
            _nerInputTypeProcessor       = config.NerInputTypeProcessorFactory.CreateInstance();
        }
Exemplo n.º 9
0
        /// <summary>
        /// Tokenizes one part of a sentence for NER model building and returns the collected words.
        /// </summary>
        /// <param name="partOfSentText">Text of the sentence part; it is pinned for the duration of the call.</param>
        /// <param name="isLastPartOfSentText">
        /// When <c>false</c>, <c>_NotSkipNonLetterAndNonDigitToTheEnd</c> is set while the part is processed.
        /// </param>
        /// <param name="nerOutputType">NER output type from which the build-model "I" and "B" input types are derived.</param>
        /// <param name="prevPartOfSentTextSameNerOutputType">
        /// When <c>true</c>, the "B" input type is set to the same value as the "I" input type
        /// instead of the value from <c>ToBuildModelNerInputTypeB()</c>.
        /// </param>
        /// <returns>
        /// The internal <c>_BuildModelWords</c> list. It is cleared at the start of every call, so
        /// callers must consume or copy it before calling again.
        /// </returns>
        /// <remarks>
        /// The per-call state (<c>_NotSkipNonLetterAndNonDigitToTheEnd</c>, <c>_ProcessSentCallback</c>)
        /// is reset in a <c>finally</c> block so that an exception cannot leak it into later calls.
        /// </remarks>
        public List <buildmodel_word_t> run4ModelBuilder(
            string partOfSentText,
            bool isLastPartOfSentText,
            NerOutputType nerOutputType,
            bool prevPartOfSentTextSameNerOutputType)
        {
            _BuildModelWords.Clear();

            // The "I" type is derived the same way in both cases; only "B" differs.
            _BuildModelNerInputTypeI = nerOutputType.ToBuildModelNerInputTypeI();
            if (prevPartOfSentTextSameNerOutputType)
            {
                _BuildModelNerInputTypeB = _BuildModelNerInputTypeI;
            }
            else
            {
                _BuildModelNerInputTypeB = nerOutputType.ToBuildModelNerInputTypeB();
            }

            _ProcessSentCallback = ProcessSentCallbackModelBuilder;

            try
            {
                fixed(char *_base = partOfSentText)
                fixed(CharType * ctm     = xlat.CHARTYPE_MAP)
                fixed(NERCharType * nctm = NER_CHARTYPE_MAP)
                {
                    // These pointers are pinned only while this fixed block is executing.
                    _BASE = _base;
                    _CTM  = ctm;
                    _NCTM = nctm;
                    _NotSkipNonLetterAndNonDigitToTheEnd = !isLastPartOfSentText;

                    var urls = _UrlDetector.AllocateUrls(partOfSentText);

                    // The whole part is treated as one sentence; an empty URL list is passed as null.
                    _BuildModelSent.Set4ModelBuilder(0, partOfSentText.Length, (0 < urls.Count) ? urls : null);

                    ProcessSentSplitterCallback(_BuildModelSent);
                }
            }
            finally
            {
                // Restore the resting state even when processing fails.
                _NotSkipNonLetterAndNonDigitToTheEnd = false;
                _ProcessSentCallback = null;
            }

            return(_BuildModelWords);
        }
Exemplo n.º 10
0
        /// <summary>
        /// Builds a tokenizer for NER model building: creates a URL detector in position-only mode
        /// from a fresh config that reuses the supplied URL model, the empty build-model sentence
        /// and the word lists, caches the character lookup tables, and installs the dummy
        /// POS-tagger processor plus a real NER processor.
        /// </summary>
        /// <param name="config">Model-builder settings.</param>
        private Tokenizer(TokenizerConfig4NerModelBuilder config)
        {
            // A separate config object is used, so the caller's UrlDetectorConfig is left untouched.
            var urlDetectorConfig = new UrlDetectorConfig()
            {
                Model          = config.UrlDetectorConfig.Model,
                UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position
            };
            _UrlDetector = new UrlDetector(urlDetectorConfig);

            _BuildModelSent        = sent_t.CreateEmpty();
            _Words                 = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _BuildModelWords       = new List<buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _ParticleThatExclusion = config.Model.ParticleThatExclusion;

            var unsafeXlat = xlat_Unsafe.Inst;
            _UIM  = unsafeXlat._UPPER_INVARIANT_MAP;
            _CTM  = unsafeXlat._CHARTYPE_MAP;
            _CCTM = UnsafeConst.GetInstanceByLanguage(config.LanguageType)._CRF_CHARTYPE_MAP;

            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

            // POS tagging always uses the dummy processor here; NER gets a factory-made one.
            _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
            _NerInputTypeProcessor       = config.NerInputTypeProcessorFactory.CreateInstance();
        }
Exemplo n.º 11
0
        /// <summary>
        /// Creates the tokenizer: a URL detector in position-only mode over
        /// <paramref name="urlModel"/>, the word list, the n-gram string builder and the
        /// add-word delegate, plus cached lookup tables.
        /// </summary>
        /// <param name="urlModel">Model handed to the URL detector.</param>
        /// <param name="wordCapacity">
        /// Requested initial capacity of the word list; <c>DEFAULT_WORDCAPACITY</c> is used when it is larger.
        /// </param>
        public mld_tokenizer(UrlDetectorModel urlModel, int wordCapacity)
        {
            _UrlDetector = new UrlDetector(new UrlDetectorConfig()
            {
                Model          = urlModel,
                UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position,
            });

            var initialCapacity = Math.Max(DEFAULT_WORDCAPACITY, wordCapacity);
            _Words               = new List<string>(initialCapacity);
            _NgramsSB            = new StringBuilder();
            _AddWordToListAction = new Action<string>(AddWordToList);

            // Lookup tables from the shared xlat_Unsafe and UnsafeConst instances.
            var unsafeXlat  = xlat_Unsafe.Inst;
            var unsafeConst = UnsafeConst.Inst;
            _UIM = unsafeXlat._UPPER_INVARIANT_MAP;
            _CTM = unsafeXlat._CHARTYPE_MAP;
            _IAW = unsafeConst._INTERPRETE_AS_WHITESPACE;
            _DWC = unsafeConst._DIGIT_WORD_CHARS;

            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);
        }
Exemplo n.º 12
0
 /// <summary>
 /// Caches the upper-case and character-type tables exposed by <c>xlat_Unsafe.Inst</c>.
 /// </summary>
 static StringsHelper()
 {
     var unsafeXlat = xlat_Unsafe.Inst;

     _UPPER_INVARIANT_MAP = unsafeXlat._UPPER_INVARIANT_MAP;
     _CHARTYPE_MAP        = unsafeXlat._CHARTYPE_MAP;
 }
Exemplo n.º 13
0
 /// <summary>
 /// Stores the supplied sets and caches the character-type table from <c>XlatUnsafe.Inst</c>.
 /// </summary>
 /// <param name="numbers">Set stored in <c>_numbers</c> (kept by reference, not copied).</param>
 /// <param name="abbreviations">Set stored in <c>_abbreviations</c> (kept by reference, not copied).</param>
 public PosTaggerInputTypeProcessorEn(HashSet <string> numbers, HashSet <string> abbreviations)
 {
     var unsafeXlat = XlatUnsafe.Inst;

     _CTM           = unsafeXlat._CHARTYPE_MAP;
     _numbers       = numbers;
     _abbreviations = abbreviations;
 }
Exemplo n.º 14
0
 /// <summary>
 /// Stores the supplied sets and caches the character-type table from <c>xlat_Unsafe.Inst</c>.
 /// </summary>
 /// <param name="numbers">Set stored in <c>_Numbers</c> (kept by reference, not copied).</param>
 /// <param name="abbreviations">Set stored in <c>_Abbreviations</c> (kept by reference, not copied).</param>
 public PosTaggerInputTypeProcessor_Ru(HashSet <string> numbers, HashSet <string> abbreviations)
 {
     var unsafeXlat = xlat_Unsafe.Inst;

     _CTM           = unsafeXlat._CHARTYPE_MAP;
     _Numbers       = numbers;
     _Abbreviations = abbreviations;
 }
Exemplo n.º 15
0
 /// <summary>
 /// Caches the character-type table exposed by <c>xlat_Unsafe.Inst</c>.
 /// </summary>
 static MModelMMFBase()
 {
     var unsafeXlat = xlat_Unsafe.Inst;
     _CTM = unsafeXlat._CHARTYPE_MAP;
 }
Exemplo n.º 16
0
 /// <summary>
 /// Caches the character-type table exposed by <c>xlat_Unsafe.Inst</c>.
 /// </summary>
 static ModelNativeBase()
 {
     var unsafeXlat = xlat_Unsafe.Inst;
     _CTM = unsafeXlat._CHARTYPE_MAP;
 }
Exemplo n.º 17
0
 /// <summary>
 /// Creates the processor and caches the character-type table exposed by <c>xlat_Unsafe.Inst</c>.
 /// </summary>
 public NerInputTypeProcessor_Ru()
 {
     var unsafeXlat = xlat_Unsafe.Inst;
     _CTM = unsafeXlat._CHARTYPE_MAP;
 }
Exemplo n.º 18
0
 /// <summary>
 /// Caches the character-type and upper-case tables exposed by <c>XlatUnsafe.Inst</c>.
 /// </summary>
 static ModelLoader()
 {
     var unsafeXlat = XlatUnsafe.Inst;

     _CHARTYPE_MAP        = unsafeXlat._CHARTYPE_MAP;
     _UPPER_INVARIANT_MAP = unsafeXlat._UPPER_INVARIANT_MAP;
 }