private char *_ptr; // current pointer into text

#region ctor dtor dispose
/// <summary>
/// Creates a URL detector from <paramref name="config"/>.
/// </summary>
/// <remarks>
/// Copies the first-level-domain and URI-scheme lookup data from <c>config.Model</c>,
/// allocates the reusable result containers, and pins two scratch buffers
/// (one sized <c>FirstLevelDomainsMaxLength + 1</c>, one sized <c>URIschemesMaxLength + 1</c>).
/// The <see cref="GCHandle"/>s of both buffers are kept in fields; releasing them is
/// not done here.
/// </remarks>
/// <param name="config">Detector configuration; its <c>Model</c> and <c>UrlExtractMode</c> are read.</param>
public UrlDetector(UrlDetectorConfig config)
{
    // Extraction mode: only "ValueAndPosition" turns value extraction on.
    _extractValue = (config.UrlExtractMode == UrlExtractModeEnum.ValueAndPosition);

    // Lookup data supplied by the model.
    var model = config.Model;
    _firstLevelDomains          = model.FirstLevelDomains;
    _firstLevelDomainsMaxLength = model.FirstLevelDomainsMaxLength;
    _URIschemes                 = model.URIschemes;
    _URIschemesMaxLength        = model.URIschemesMaxLength;

    // Reusable working objects.
    _urls          = new List<Url>(DEFAULT_LIST_CAPACITY);
    _stringBuilder = new StringBuilder();
    _url           = new Url();
    _urlStructs    = new List<UrlStruct>(DEFAULT_LIST_CAPACITY);

    // Shared character tables.
    _CTM = XlatUnsafe.Inst._CHARTYPE_MAP;
    _UIM = XlatUnsafe.Inst._UPPER_INVARIANT_MAP;

    // Pinned scratch buffer for first-level domains (max length + 1 chars).
    _firstLevelDomainBuffer         = new char[_firstLevelDomainsMaxLength + 1];
    _firstLevelDomainBufferGCHandle = GCHandle.Alloc(_firstLevelDomainBuffer, GCHandleType.Pinned);
    _fldBufferPtrBase               = (char *)_firstLevelDomainBufferGCHandle.AddrOfPinnedObject().ToPointer();

    // Pinned scratch buffer for URI schemes (max length + 1 chars).
    _URIschemesBuffer         = new char[_URIschemesMaxLength + 1];
    _URIschemesBufferGCHandle = GCHandle.Alloc(_URIschemesBuffer, GCHandleType.Pinned);
    _uriSchBufferPtrBase      = (char *)_URIschemesBufferGCHandle.AddrOfPinnedObject().ToPointer();
}
/// <summary>
/// Creates a tokenizer from <paramref name="config"/>.
/// </summary>
/// <remarks>
/// A real POS-tagger / NER input-type processor is created through the corresponding
/// factory only when the matching <c>TokenizeMode</c> flag is set; otherwise the
/// dummy singleton is used.
/// </remarks>
/// <param name="config">Tokenizer configuration (sentence splitter config, model, language, mode, factories).</param>
public Tokenizer(TokenizerConfig config)
{
    _SentSplitter          = new SentSplitter(config.SentSplitterConfig);
    _Words                 = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
    _ParticleThatExclusion = config.Model.ParticleThatExclusion;

    // Delegate is created once and kept in a field.
    _SentSplitterProcessSentCallback_Delegate =
        new SentSplitter.ProcessSentCallbackDelegate(SentSplitterProcessSentCallback);

    // Shared character tables; the CRF char-type map is language specific.
    _UIM  = xlat_Unsafe.Inst._UPPER_INVARIANT_MAP;
    _CTM  = xlat_Unsafe.Inst._CHARTYPE_MAP;
    _CCTM = UnsafeConst.GetInstanceByLanguage(config.LanguageType)._CRF_CHARTYPE_MAP;

    ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

    var posTaggerRequested = (config.TokenizeMode & TokenizeMode.PosTagger) == TokenizeMode.PosTagger;
    if (!posTaggerRequested)
    {
        _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
    }
    else
    {
        _PosTaggerInputTypeProcessor = config.PosTaggerInputTypeProcessorFactory.CreateInstance();
    }

    var nerRequested = (config.TokenizeMode & TokenizeMode.Ner) == TokenizeMode.Ner;
    if (!nerRequested)
    {
        _NerInputTypeProcessor = Dummy_NerInputTypeProcessor.Instance;
    }
    else
    {
        _NerInputTypeProcessor = config.NerInputTypeProcessorFactory.CreateInstance();
    }
}
/// <summary>
/// Strips leading and trailing non-letter characters from <paramref name="term"/> and
/// upper-cases the remaining characters through the <paramref name="uim"/> table.
/// </summary>
/// <remarks>
/// NOTE: the upper-casing is written through a pinned pointer into <paramref name="term"/>
/// itself, so the caller's string instance is modified in place.
/// When something was trimmed and at most one character is left, <c>null</c> is returned.
/// </remarks>
/// <param name="ctm">Char-type table indexed by character code.</param>
/// <param name="uim">Upper-case mapping table indexed by character code.</param>
/// <param name="term">Term to normalize (modified in place).</param>
/// <returns>
/// <paramref name="term"/> itself when nothing had to be trimmed; a new string holding the
/// trimmed, upper-cased range otherwise; <c>null</c> when trimming leaves fewer than two characters.
/// </returns>
unsafe private static string NormalizeTerm(CharType *ctm, char *uim, string term)
{
    var lastIndex = term.Length - 1;

    fixed (char *chars = term)
    {
        // Move right to the first letter (ends at term.Length when there is none).
        var first = 0;
        while (first <= lastIndex && (ctm[chars[first]] & CharType.IsLetter) != CharType.IsLetter)
        {
            first++;
        }

        // Move left to the last letter, never crossing 'first'.
        var last = lastIndex;
        while (first < last && (ctm[chars[last]] & CharType.IsLetter) != CharType.IsLetter)
        {
            last--;
        }

        var nothingTrimmed = (first == 0) && (last == lastIndex);
        if (nothingTrimmed)
        {
            // Whole term is kept: upper-case it in place and hand the same instance back.
            for (var pos = 0; pos <= lastIndex; pos++)
            {
                chars[pos] = uim[chars[pos]];
            }
            return term;
        }

        if (last <= first)
        {
            return null;
        }

        // Upper-case only the surviving range, then copy it out.
        for (var pos = first; pos <= last; pos++)
        {
            chars[pos] = uim[chars[pos]];
        }
        return new string(chars, first, last - first + 1);
    }
}
/// <summary>
/// Creates the morphology stage of the POS tagger.
/// </summary>
/// <param name="morphoModel">Morphology model; stored and passed to the <c>MorphoAnalyzer</c>.</param>
/// <param name="morphoAmbiguityModel">Model passed to the <c>MorphoAmbiguityResolver</c>.</param>
public PosTaggerMorphoAnalyzer(
    IMorphoModel                 morphoModel,
    MorphoAmbiguityResolverModel morphoAmbiguityModel)
{
    // Morphological analysis.
    _morphoModel    = morphoModel;
    _morphoAnalyzer = new MorphoAnalyzer(_morphoModel);

    // Ambiguity pre-processing and resolution.
    _morphoAmbiguityPreProcessor = new MorphoAmbiguityPreProcessor();
    _morphoAmbiguityResolver     = new MorphoAmbiguityResolver(morphoAmbiguityModel);

    // Reusable buffers; the factory is constructed with a null argument.
    _wordFormMorphologies_Buffer = new List<WordFormMorphology>();
    _wordMorphoAmbiguityFactory  = new WordMorphoAmbiguityFactory(null);
    _wordMorphoAmbiguities       = new List<WordMorphoAmbiguity>();

    // Shared char-type table.
    CTM = XlatUnsafe.Inst._CHARTYPE_MAP;
}
/// <summary>
/// Pins the UTF-8 encodings of the marker constants (and, when the corresponding
/// compilation symbols are defined, the character tables) and caches raw pointers to them.
/// </summary>
/// <remarks>
/// Every <see cref="GCHandle"/> created here lives only in a local variable, so nothing
/// in this constructor frees the pinned arrays.
/// The "other" marker is encoded three times from the same <c>INPUTTYPE_OTHER</c> constant,
/// giving three independent pinned buffers.
/// </remarks>
private xlat_Unsafe()
{
    // -1- "other" input type (generic)
    var otherUtf8   = Encoding.UTF8.GetBytes(INPUTTYPE_OTHER);
    var otherHandle = GCHandle.Alloc(otherUtf8, GCHandleType.Pinned);
    _InputtypeOtherPtrBase = (byte *)otherHandle.AddrOfPinnedObject().ToPointer();

    // -1- "other" input type (POS tagger)
    var posOtherUtf8   = Encoding.UTF8.GetBytes(INPUTTYPE_OTHER);
    var posOtherHandle = GCHandle.Alloc(posOtherUtf8, GCHandleType.Pinned);
    _PosInputtypeOtherPtrBase = (byte *)posOtherHandle.AddrOfPinnedObject().ToPointer();

    // -1- "other" input type (NER)
    var nerOtherUtf8   = Encoding.UTF8.GetBytes(INPUTTYPE_OTHER);
    var nerOtherHandle = GCHandle.Alloc(nerOtherUtf8, GCHandleType.Pinned);
    _NerInputtypeOtherPtrBase = (byte *)nerOtherHandle.AddrOfPinnedObject().ToPointer();

    // -2- begin-of-sentence marker
    var bosUtf8   = Encoding.UTF8.GetBytes(BEGIN_OF_SENTENCE);
    var bosHandle = GCHandle.Alloc(bosUtf8, GCHandleType.Pinned);
    _BeginOfSentencePtrBase = (byte *)bosHandle.AddrOfPinnedObject().ToPointer();

    // -3- end-of-sentence marker
    var eosUtf8   = Encoding.UTF8.GetBytes(END_OF_SENTENCE);
    var eosHandle = GCHandle.Alloc(eosUtf8, GCHandleType.Pinned);
    _EndOfSentencePtrBase = (byte *)eosHandle.AddrOfPinnedObject().ToPointer();

#if XLAT_CHARTYPE_MAP
    // -4- char-type table: copied element by element into a ushort[] which is then pinned
    var charTypeValues = new ushort[xlat.CHARTYPE_MAP.Length];
    for (int idx = 0; idx < charTypeValues.Length; idx++)
    {
        charTypeValues[idx] = (ushort)xlat.CHARTYPE_MAP[idx];
    }
    var charTypeHandle = GCHandle.Alloc(charTypeValues, GCHandleType.Pinned);
    _CHARTYPE_MAP = (CharType *)charTypeHandle.AddrOfPinnedObject().ToPointer();
#endif

#if XLAT_UPPER_INVARIANT_MAP
    // -5- upper-invariant table: the source array itself is pinned
    var upperMapHandle = GCHandle.Alloc(xlat.UPPER_INVARIANT_MAP, GCHandleType.Pinned);
    _UPPER_INVARIANT_MAP = (char *)upperMapHandle.AddrOfPinnedObject().ToPointer();
#endif
}
/// <summary>
/// Pins the UTF-8 encodings of the marker constants and, depending on the defined
/// compilation symbols, the char-type / upper-invariant / lower-invariant tables,
/// caching a raw pointer to each.
/// </summary>
/// <remarks>
/// The <see cref="GCHandle"/>s are held only in locals; this constructor never frees them.
/// <c>INPUTTYPE_OTHER</c> is encoded three separate times, once per pointer field.
/// </remarks>
private xlat_Unsafe()
{
    // -1- "other" input type (generic)
    var genericOtherBytes = Encoding.UTF8.GetBytes(INPUTTYPE_OTHER);
    var genericOtherPin   = GCHandle.Alloc(genericOtherBytes, GCHandleType.Pinned);
    _InputtypeOtherPtrBase = (byte *)genericOtherPin.AddrOfPinnedObject().ToPointer();

    // -1- "other" input type (POS tagger)
    var posOtherBytes = Encoding.UTF8.GetBytes(INPUTTYPE_OTHER);
    var posOtherPin   = GCHandle.Alloc(posOtherBytes, GCHandleType.Pinned);
    _PosInputtypeOtherPtrBase = (byte *)posOtherPin.AddrOfPinnedObject().ToPointer();

    // -1- "other" input type (NER)
    var nerOtherBytes = Encoding.UTF8.GetBytes(INPUTTYPE_OTHER);
    var nerOtherPin   = GCHandle.Alloc(nerOtherBytes, GCHandleType.Pinned);
    _NerInputtypeOtherPtrBase = (byte *)nerOtherPin.AddrOfPinnedObject().ToPointer();

    // -2- begin-of-sentence marker
    var sentenceBeginBytes = Encoding.UTF8.GetBytes(BEGIN_OF_SENTENCE);
    var sentenceBeginPin   = GCHandle.Alloc(sentenceBeginBytes, GCHandleType.Pinned);
    _BeginOfSentencePtrBase = (byte *)sentenceBeginPin.AddrOfPinnedObject().ToPointer();

    // -3- end-of-sentence marker
    var sentenceEndBytes = Encoding.UTF8.GetBytes(END_OF_SENTENCE);
    var sentenceEndPin   = GCHandle.Alloc(sentenceEndBytes, GCHandleType.Pinned);
    _EndOfSentencePtrBase = (byte *)sentenceEndPin.AddrOfPinnedObject().ToPointer();

#if XLAT_CHARTYPE_MAP
    // -4- char-type table: converted to ushort values in a fresh array, which is pinned
    var charTypeCodes = new ushort[xlat.CHARTYPE_MAP.Length];
    for (int k = 0; k < charTypeCodes.Length; k++)
    {
        charTypeCodes[k] = (ushort)xlat.CHARTYPE_MAP[k];
    }
    var charTypePin = GCHandle.Alloc(charTypeCodes, GCHandleType.Pinned);
    _CHARTYPE_MAP = (CharType *)charTypePin.AddrOfPinnedObject().ToPointer();
#endif

#if XLAT_UPPER_INVARIANT_MAP
    // -5- upper-invariant table: the source array itself is pinned
    var upperMapPin = GCHandle.Alloc(xlat.UPPER_INVARIANT_MAP, GCHandleType.Pinned);
    _UPPER_INVARIANT_MAP = (char *)upperMapPin.AddrOfPinnedObject().ToPointer();
#endif

#if XLAT_LOWER_INVARIANT_MAP
    // -5- lower-invariant table: the source array itself is pinned
    var lowerMapPin = GCHandle.Alloc(xlat.LOWER_INVARIANT_MAP, GCHandleType.Pinned);
    _LOWER_INVARIANT_MAP = (char *)lowerMapPin.AddrOfPinnedObject().ToPointer();
#endif
}
/// <summary>
/// Tokenizes <paramref name="text"/>: the sentence splitter is run over it and
/// <c>ProcessSentSplitterCallback</c> is invoked by the splitter.
/// </summary>
/// <remarks>
/// The text and both char-type tables are pinned for the duration of the call and their
/// addresses are published through <c>_BASE</c>, <c>_CTM</c> and <c>_NCTM</c>.
/// <c>_ProcessSentCallback</c> is set on entry and cleared again after the splitter returns.
/// </remarks>
/// <param name="text">Text to tokenize.</param>
/// <param name="splitBySmiles">Forwarded to <c>_SentSplitter.SplitBySmiles</c>.</param>
/// <param name="processSentCallback">Callback stored in <c>_ProcessSentCallback</c> while the call runs.</param>
public void run(string text, bool splitBySmiles, ProcessSentCallbackDelegate processSentCallback)
{
    _ProcessSentCallback = processSentCallback;

    fixed (char *textPtr = text)
    fixed (CharType *charTypePtr = xlat.CHARTYPE_MAP)
    fixed (NERCharType *nerCharTypePtr = NER_CHARTYPE_MAP)
    {
        // Publish the pinned addresses for the duration of the split.
        _BASE = textPtr;
        _CTM  = charTypePtr;
        _NCTM = nerCharTypePtr;

        _SentSplitter.SplitBySmiles = splitBySmiles;
        _SentSplitter.AllocateSents(text, ProcessSentSplitterCallback);
    }

    _ProcessSentCallback = null;
}
/// <summary>
/// Creates a tokenizer used for building the NER model.
/// </summary>
/// <remarks>
/// The URL detector always runs in <c>Position</c> extract mode. The detector gets its own
/// <c>UrlDetectorConfig</c> (same model, forced mode) so that the caller's
/// <c>config.UrlDetectorConfig</c> is left untouched; previously its <c>UrlExtractMode</c>
/// was overwritten as a side effect of constructing the tokenizer.
/// POS tagging uses the dummy processor; the NER processor comes from the configured factory.
/// </remarks>
/// <param name="config">Model-builder configuration (URL detector config, model, language, NER factory).</param>
private Tokenizer(TokenizerConfig4NerModelBuilder config)
{
    // Private copy of the detector settings: do not mutate the caller-owned config object.
    var urlDetectorConfig = new UrlDetectorConfig()
    {
        Model          = config.UrlDetectorConfig.Model,
        UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position,
    };
    _urlDetector = new UrlDetector(urlDetectorConfig);

    // Reusable containers.
    _buildModelSentence = Sentence.CreateEmpty();
    _words              = new List<Word>(DEFAULT_WORDSLIST_CAPACITY);
    _buildModelWords    = new List<Buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);

    _particleThatExclusion = config.Model.ParticleThatExclusion;

    // Shared character tables; the CRF char-type map is language specific.
    _UIM  = XlatUnsafe.Inst._UPPER_INVARIANT_MAP;
    _CTM  = XlatUnsafe.Inst._CHARTYPE_MAP;
    _CCTM = UnsafeConst.GetInstanceByLanguage(config.LanguageType)._CRF_CHARTYPE_MAP;

    ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

    _posTaggerInputTypeProcessor = DummyPosTaggerInputTypeProcessor.Instance;
    _nerInputTypeProcessor       = config.NerInputTypeProcessorFactory.CreateInstance();
}
/// <summary>
/// Tokenizes one part of a sentence for the NER model builder.
/// </summary>
/// <remarks>
/// The returned list is the internal <c>_BuildModelWords</c> instance, which is cleared at
/// the start of every call.
/// The per-call state (<c>_ProcessSentCallback</c> and
/// <c>_NotSkipNonLetterAndNonDigitToTheEnd</c>) is reset in a <c>finally</c> block, so an
/// exception thrown while tokenizing cannot leave the flag stuck at <c>true</c> for the
/// next call on this instance.
/// </remarks>
/// <param name="partOfSentText">Text of the sentence part.</param>
/// <param name="isLastPartOfSentText">
/// When <c>false</c>, <c>_NotSkipNonLetterAndNonDigitToTheEnd</c> is set for the duration of the call.
/// </param>
/// <param name="nerOutputType">NER output type from which the "I" and "B" build-model input types are derived.</param>
/// <param name="prevPartOfSentTextSameNerOutputType">
/// When <c>true</c>, the "B" input type is set to the same value as the "I" input type.
/// </param>
/// <returns>The internal list of build-model words produced for this part.</returns>
public List <buildmodel_word_t> run4ModelBuilder(
    string        partOfSentText,
    bool          isLastPartOfSentText,
    NerOutputType nerOutputType,
    bool          prevPartOfSentTextSameNerOutputType)
{
    _BuildModelWords.Clear();

    // The "I" type is the same in both cases; only the "B" type depends on the previous part.
    _BuildModelNerInputTypeI = nerOutputType.ToBuildModelNerInputTypeI();
    if (prevPartOfSentTextSameNerOutputType)
    {
        _BuildModelNerInputTypeB = _BuildModelNerInputTypeI;
    }
    else
    {
        _BuildModelNerInputTypeB = nerOutputType.ToBuildModelNerInputTypeB();
    }

    _ProcessSentCallback = ProcessSentCallbackModelBuilder;
    try
    {
        fixed (char *_base = partOfSentText)
        fixed (CharType *ctm = xlat.CHARTYPE_MAP)
        fixed (NERCharType *nctm = NER_CHARTYPE_MAP)
        {
            _BASE = _base;
            _CTM  = ctm;
            _NCTM = nctm;

            _NotSkipNonLetterAndNonDigitToTheEnd = !isLastPartOfSentText;

            // The whole part is treated as a single sentence; URLs are attached only when any were found.
            var urls = _UrlDetector.AllocateUrls(partOfSentText);
            _BuildModelSent.Set4ModelBuilder(0, partOfSentText.Length, (0 < urls.Count) ? urls : null);

            ProcessSentSplitterCallback(_BuildModelSent);
        }
    }
    finally
    {
        // Always restore per-call state, even when tokenization throws.
        _NotSkipNonLetterAndNonDigitToTheEnd = false;
        _ProcessSentCallback = null;
    }

    return _BuildModelWords;
}
/// <summary>
/// Creates a tokenizer used for building the NER model.
/// </summary>
/// <remarks>
/// The URL detector is given a fresh <c>UrlDetectorConfig</c> that reuses the model from
/// <paramref name="config"/> and forces <c>Position</c> extract mode.
/// POS tagging uses the dummy processor; the NER processor comes from the configured factory.
/// </remarks>
/// <param name="config">Model-builder configuration (URL detector config, model, language, NER factory).</param>
private Tokenizer(TokenizerConfig4NerModelBuilder config)
{
    var urlDetectorConfig = new UrlDetectorConfig()
    {
        Model          = config.UrlDetectorConfig.Model,
        UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position
    };
    _UrlDetector = new UrlDetector(urlDetectorConfig);

    // Reusable containers.
    _BuildModelSent  = sent_t.CreateEmpty();
    _Words           = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
    _BuildModelWords = new List<buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);

    _ParticleThatExclusion = config.Model.ParticleThatExclusion;

    // Shared character tables; the CRF char-type map is language specific.
    _UIM  = xlat_Unsafe.Inst._UPPER_INVARIANT_MAP;
    _CTM  = xlat_Unsafe.Inst._CHARTYPE_MAP;
    _CCTM = UnsafeConst.GetInstanceByLanguage(config.LanguageType)._CRF_CHARTYPE_MAP;

    ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

    _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
    _NerInputTypeProcessor       = config.NerInputTypeProcessorFactory.CreateInstance();
}
/// <summary>
/// Creates the tokenizer with a URL detector in <c>Position</c> extract mode.
/// </summary>
/// <param name="urlModel">Model handed to the URL detector.</param>
/// <param name="wordCapacity">
/// Requested initial capacity of the word list; values below <c>DEFAULT_WORDCAPACITY</c>
/// are raised to that default.
/// </param>
public mld_tokenizer(UrlDetectorModel urlModel, int wordCapacity)
{
    _UrlDetector = new UrlDetector(new UrlDetectorConfig()
    {
        Model          = urlModel,
        UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position,
    });

    var initialCapacity = Math.Max(DEFAULT_WORDCAPACITY, wordCapacity);
    _Words    = new List<string>(initialCapacity);
    _NgramsSB = new StringBuilder();

    // Delegate is created once and kept in a field.
    _AddWordToListAction = new Action<string>(AddWordToList);

    // Shared lookup tables.
    _UIM = xlat_Unsafe.Inst._UPPER_INVARIANT_MAP;
    _CTM = xlat_Unsafe.Inst._CHARTYPE_MAP;
    _IAW = UnsafeConst.Inst._INTERPRETE_AS_WHITESPACE;
    _DWC = UnsafeConst.Inst._DIGIT_WORD_CHARS;

    ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);
}
/// <summary>
/// Caches the upper-invariant and char-type tables exposed by <c>xlat_Unsafe.Inst</c>.
/// </summary>
static StringsHelper()
{
    var tables = xlat_Unsafe.Inst;

    _UPPER_INVARIANT_MAP = tables._UPPER_INVARIANT_MAP;
    _CHARTYPE_MAP        = tables._CHARTYPE_MAP;
}
/// <summary>
/// Creates the English POS-tagger input-type processor.
/// </summary>
/// <param name="numbers">Set stored in <c>_numbers</c> (kept by reference, not copied).</param>
/// <param name="abbreviations">Set stored in <c>_abbreviations</c> (kept by reference, not copied).</param>
public PosTaggerInputTypeProcessorEn(
    HashSet<string> numbers,
    HashSet<string> abbreviations)
{
    _numbers       = numbers;
    _abbreviations = abbreviations;

    // Shared char-type table.
    _CTM = XlatUnsafe.Inst._CHARTYPE_MAP;
}
/// <summary>
/// Creates the Russian POS-tagger input-type processor.
/// </summary>
/// <param name="numbers">Set stored in <c>_Numbers</c> (kept by reference, not copied).</param>
/// <param name="abbreviations">Set stored in <c>_Abbreviations</c> (kept by reference, not copied).</param>
public PosTaggerInputTypeProcessor_Ru(
    HashSet<string> numbers,
    HashSet<string> abbreviations)
{
    _Numbers       = numbers;
    _Abbreviations = abbreviations;

    // Shared char-type table.
    _CTM = xlat_Unsafe.Inst._CHARTYPE_MAP;
}
/// <summary>
/// Caches the char-type table exposed by <c>xlat_Unsafe.Inst</c> in <c>_CTM</c>.
/// </summary>
static MModelMMFBase()
{
    var tables = xlat_Unsafe.Inst;
    _CTM = tables._CHARTYPE_MAP;
}
/// <summary>
/// Caches the char-type table exposed by <c>xlat_Unsafe.Inst</c> in <c>_CTM</c>.
/// </summary>
static ModelNativeBase()
{
    var tables = xlat_Unsafe.Inst;
    _CTM = tables._CHARTYPE_MAP;
}
/// <summary>
/// Creates the Russian NER input-type processor and caches the shared char-type table.
/// </summary>
public NerInputTypeProcessor_Ru()
{
    var tables = xlat_Unsafe.Inst;
    _CTM = tables._CHARTYPE_MAP;
}
/// <summary>
/// Caches the char-type and upper-invariant tables exposed by <c>XlatUnsafe.Inst</c>.
/// </summary>
static ModelLoader()
{
    var tables = XlatUnsafe.Inst;

    _CHARTYPE_MAP        = tables._CHARTYPE_MAP;
    _UPPER_INVARIANT_MAP = tables._UPPER_INVARIANT_MAP;
}