/// <summary>
/// Advances the current matcher node (<c>_Node</c>) by one word.
/// </summary>
/// <param name="word">Word whose <c>nerOutputType</c> is used as the transition key.</param>
/// <param name="node">
/// Receives the node reached after the step, or <c>null</c> when the word is rejected.
/// </param>
/// <returns>
/// <c>false</c> when <c>word.IsWordInNerChain</c> is set (the word is skipped);
/// otherwise <c>true</c>.
/// </returns>
public bool Find(word_t word, out TreeNode node)
{
    TreeNode next;
    for (; ; )
    {
        // Words flagged as already being in a NER chain are never matched.
        if (word.IsWordInNerChain)
        {
            node = null;
            return false;
        }

        next = _Node.GetTransition(word.nerOutputType);

        // Stop at the root (nothing further to fall back to) or as soon as a transition exists.
        if ((_Node == _Root) || (next != null))
        {
            break;
        }

        // No transition from this node: fall back along the failure link and retry.
        _Node = _Node.Failure;
    }

    if (next != null)
    {
        _Node = next;
    }

    node = _Node;
    return true;
}
/*
 * This applies at the morphology + tagger stage, before disambiguation.
 *
 * When several normalization variants exist that differ in letter case, candidates are selected as follows:
 *  - the word starts with a [lower-case] letter and its part of speech is NOUN, ADJECTIVE or ADVERB: take [all] variants;
 *  - the word starts with a [lower-case] letter and its part of speech is NOT NOUN, ADJECTIVE or ADVERB: take the [lower-case] variants;
 *  - the word starts with an [upper-case] letter and is the {first} word of the sentence: take [all] variants;
 *  - the word starts with an [upper-case] letter, is NOUN, ADJECTIVE or ADVERB and is {not the first} word: take the [upper-case] variants;
 *  - the word starts with an [upper-case] letter, is NOT NOUN, ADJECTIVE or ADVERB and is {not the first} word: take [all] variants.
 */
#endregion

/// <summary>
/// Chooses the word-form morphology mode for a word from its position in the sentence,
/// the case of its first character and its PoS-tagger output type.
/// </summary>
/// <param name="word">Word being analysed.</param>
/// <param name="wordindex">Zero-based index of the word; index 0 always yields <c>Default</c>.</param>
/// <returns>The selected <see cref="WordFormMorphologyModeEnum"/> value.</returns>
private static WordFormMorphologyModeEnum GetWordFormMorphologyMode(word_t word, int wordindex)
{
    // The first word of the sentence gets the default mode regardless of case.
    if (wordindex == 0)
    {
        return WordFormMorphologyModeEnum.Default;
    }

    // NOTE(review): the lower-case branch does not look at the part of speech, while the
    // rules above distinguish NOUN/ADJECTIVE/ADVERB for lower-case words - confirm this is
    // intended (the mode may be an ordering preference rather than a filter).
    if (!word.posTaggerFirstCharIsUpper)
    {
        return WordFormMorphologyModeEnum.FirstStartsWithLowerAfterUpperLetter;
    }

    var partOfSpeech = word.posTaggerOutputType;
    var isNounAdjectiveOrAdverb = (partOfSpeech == PosTaggerOutputType.Noun)
                               || (partOfSpeech == PosTaggerOutputType.Adjective)
                               || (partOfSpeech == PosTaggerOutputType.Adverb);

    return isNounAdjectiveOrAdverb
        ? WordFormMorphologyModeEnum.FirstStartsWithUpperAfterLowerLetter
        : WordFormMorphologyModeEnum.Default;
}
/// <summary>
/// Creates the morpho-ambiguity record of a single word.
/// </summary>
/// <param name="word">The word the record belongs to.</param>
/// <param name="punctuationType">Punctuation classification stored for the word.</param>
/// <param name="morphoAmbiguityTuples">
/// List of candidate tuples for the word; the reference is stored as-is, not copied.
/// </param>
public WordMorphoAmbiguity_t(
    word_t                                     word,
    MorphoAmbiguityTuple_t.PunctuationTypeEnum punctuationType,
    List<MorphoAmbiguityTuple_t>               morphoAmbiguityTuples)
{
    this.Word                  = word;
    this.PunctuationType       = punctuationType;
    this.MorphoAmbiguityTuples = morphoAmbiguityTuples;
}
/// <summary>
/// Creates a tuple that pairs a word with one of its morphology variants.
/// </summary>
/// <param name="word">The word the tuple belongs to.</param>
/// <param name="wordFormMorphology">The morphology variant stored in this tuple.</param>
/// <param name="punctuationType">Punctuation classification stored for the word.</param>
public unsafe MorphoAmbiguityTuple_t(
    word_t               word,
    WordFormMorphology_t wordFormMorphology,
    PunctuationTypeEnum  punctuationType)
{
    this.Word               = word;
    this.WordFormMorphology = wordFormMorphology;
    this.PunctuationType    = punctuationType;
}
/// <summary>
/// Builds the morpho-ambiguity record for a word that has no morphology variants:
/// the record holds a single tuple with a default-constructed <c>WordFormMorphology_t</c>.
/// </summary>
/// <param name="word">The word to wrap.</param>
/// <param name="wordIdex">Index of the word; selects the reusable tuple list in the internal buffer.</param>
/// <returns>
/// A record backed by the pooled list at <paramref name="wordIdex"/>; that list is cleared
/// and refilled on every call with the same index.
/// </returns>
public unsafe WordMorphoAmbiguity_t Create(word_t word, int wordIdex)
{
    // Grow the pool until it has a slot for this word index.
    while (!(wordIdex < _MorphoAmbiguityTuples_Buffer.Count))
    {
        var slot = new List<MorphoAmbiguityTuple_t>(DEFAULT_WORDFORMMORPHOLOGY_COUNT);
        _MorphoAmbiguityTuples_Buffer.Add(slot);
    }

    var punctuation = MorphoAmbiguityTuple_t.GetPunctuationType(word);

    var tuples = _MorphoAmbiguityTuples_Buffer[wordIdex];
    tuples.Clear();

    var emptyMorphology = new WordFormMorphology_t();
    tuples.Add(new MorphoAmbiguityTuple_t(word, emptyMorphology, punctuation));

    return new WordMorphoAmbiguity_t(word, punctuation, tuples);
}
/// <summary>
/// Builds the morpho-ambiguity record for a word, with one tuple per supplied morphology variant.
/// </summary>
/// <param name="word">The word to wrap.</param>
/// <param name="wordIdex">Index of the word; selects the reusable tuple list in the internal buffer.</param>
/// <param name="wordFormMorphologies">Morphology variants of the word, added in array order.</param>
/// <returns>
/// A record backed by the pooled list at <paramref name="wordIdex"/>; that list is cleared
/// and refilled on every call with the same index.
/// </returns>
public unsafe WordMorphoAmbiguity_t Create(word_t word, int wordIdex, WordFormMorphology_t[] wordFormMorphologies)
{
    // Grow the pool until it has a slot for this word index.
    while (!(wordIdex < _MorphoAmbiguityTuples_Buffer.Count))
    {
        var slot = new List<MorphoAmbiguityTuple_t>(DEFAULT_WORDFORMMORPHOLOGY_COUNT);
        _MorphoAmbiguityTuples_Buffer.Add(slot);
    }

    var punctuation = MorphoAmbiguityTuple_t.GetPunctuationType(word);

    var tuples = _MorphoAmbiguityTuples_Buffer[wordIdex];
    tuples.Clear();

    foreach (var morphology in wordFormMorphologies)
    {
        tuples.Add(new MorphoAmbiguityTuple_t(word, morphology, punctuation));
    }

    return new WordMorphoAmbiguity_t(word, punctuation, tuples);
}
/// <summary>
/// Classifies a word as quote, bracket, other punctuation or non-punctuation.
/// </summary>
/// <param name="word">
/// Word to classify. Side effect: when the first character is found to be a quote,
/// <c>word.nerInputType</c> is set to <c>NerInputType.Q</c>.
/// </param>
/// <returns>
/// <c>__NonPunctuation__</c> when the PoS-tagger output type is not <c>Punctuation</c>;
/// otherwise the punctuation kind derived from <c>nerInputType</c> or from the first
/// character of <c>valueOriginal</c>.
/// </returns>
public static unsafe PunctuationTypeEnum GetPunctuationType(word_t word)
{
    if (word.posTaggerOutputType != PosTaggerOutputType.Punctuation)
    {
        return PunctuationTypeEnum.__NonPunctuation__;
    }

    // Already marked as a quote: no need to inspect the characters.
    if (word.nerInputType == NerInputType.Q)
    {
        return PunctuationTypeEnum.PunctuationQuote;
    }

    // NOTE(review): the first character is dereferenced without a null check - this
    // assumes punctuation words always carry a non-null valueOriginal; confirm.
    fixed (char* firstChar = word.valueOriginal)
    {
        var charType = *(xlat_Unsafe.Inst._CHARTYPE_MAP + *firstChar);

        if ((charType & CharType.IsQuote) == CharType.IsQuote)
        {
            // Record the finding on the word so later calls take the early return above.
            word.nerInputType = NerInputType.Q;
            return PunctuationTypeEnum.PunctuationQuote;
        }

        if ((charType & CharType.IsBracket) == CharType.IsBracket)
        {
            return PunctuationTypeEnum.PunctuationBracket;
        }

        return PunctuationTypeEnum.Punctuation;
    }
}
/// <summary>
/// Classifies a token into a <see cref="PosTaggerInputTypeResult"/> by scanning its
/// characters once (counting digits, lower/upper-case letters, hyphens, dots and roman-numeral
/// symbols) and then applying a fixed sequence of rules; the first matching rule wins.
/// </summary>
/// <param name="_base">Pointer to the first character of the token.</param>
/// <param name="length">Number of characters to scan starting at <paramref name="_base"/>.</param>
/// <param name="word">
/// The word being classified; its <c>valueUpper</c> and <c>valueOriginal</c> are used for the
/// number and abbreviation dictionary lookups.
/// </param>
/// <returns>The classification result; <c>PosTaggerInputTypeResult.O</c> when no rule matches.</returns>
unsafe public PosTaggerInputTypeResult GetResult(char *_base, int length, word_t word)
{
    // -1- per-category character counters
    int digitsCount = 0, upperCount = 0, lowerCount = 0, hyphenCount = 0, pointCount = 0, romanNumberCount = 0;
    // Index of the first hyphen that is preceded only by letters (no digits); -1 if there is none.
    int firstHyphenIndex = -1;

    // -2- single pass over the characters
    #region [.main cycle.]
    for (int i = 0; i < length; i++)
    {
        var ch = *(_base + i);
        // Character-type flags looked up by character value.
        var ct = *(_CTM + ch);
        if ((ct & CharType.IsDigit) == CharType.IsDigit)
        {
            digitsCount++;
        }
        else if ((ct & CharType.IsLower) == CharType.IsLower)
        {
            lowerCount++;
        }
        else if ((ct & CharType.IsUpper) == CharType.IsUpper)
        {
            upperCount++;
            if (IsRomanSymbol(ch))
            {
                romanNumberCount++;
            }
        }
        else if ((ct & CharType.IsHyphen) == CharType.IsHyphen)
        {
            hyphenCount++;
            // Remember only the first hyphen, and only if it is not the first character and
            // every character before it was a letter (i == lowerCount + upperCount, no digits).
            if ((firstHyphenIndex == -1) && (i != 0) && (digitsCount == 0) && (i == lowerCount + upperCount))
            {
                firstHyphenIndex = i;
            }
        }
        else if (xlat.IsDot(ch))
        {
            pointCount++;
        }
    }
    #endregion

    if (pointCount == 0)
    {
        // Roman numeral: no digits, and every character (optionally excluding hyphens)
        // is a roman-numeral symbol.
        if ((digitsCount == 0) && (0 < romanNumberCount) && ((romanNumberCount == length) || (romanNumberCount == length - hyphenCount)))
        {
            return(PosTaggerInputTypeResult.Num);
        }

        // Differs from the Russian variant: the all-Latin check (AllLat) is disabled here.
    }

    if ((lowerCount == 0) && (upperCount == 0))
    {
        // Digits in any combination with punctuation marks, without letters - Num
        if (digitsCount != 0)
        {
            return(PosTaggerInputTypeResult.Num);
        }

        var _first_ch = *_base;
        switch (_first_ch)
        {
            // comma - Com
            case ',': return(PosTaggerInputTypeResult.Com);

            // colon - Col
            case ':': return(PosTaggerInputTypeResult.Col);
        }

        var _first_ct = *(_CTM + _first_ch);
        // hyphen - Dush
        if ((_first_ct & CharType.IsHyphen) == CharType.IsHyphen)
        {
            return(PosTaggerInputTypeResult.Dush);
        }
    }
    else if ((digitsCount == 0) && (firstHyphenIndex == -1))
    {
        // Letters only (plus possibly dots / a leading hyphen): dictionary lookups,
        // chosen by the number of dots in the token.
        switch (pointCount)
        {
            case 0:
                // No dots, no hyphens: number word looked up by its upper-case value.
                if ((hyphenCount == 0) && _Numbers.Contains(word.valueUpper) )
                {
                    return(PosTaggerInputTypeResult.CreateNum());
                }
                break;

            case 1:
                if ((hyphenCount == 0) && // no hyphen
                    (xlat.IsDot(*(_base + length - 1))) && // the single dot is the last character
                    _Numbers.Contains(word.valueUpper) )
                {
                    return(PosTaggerInputTypeResult.CreateNum());
                }
                break;

            default: // 1 < pointCount
                // The abbreviation lookup is case-sensitive: it uses valueOriginal, not valueUpper.
                if ((hyphenCount == 0) && _Abbreviations.Contains(word.valueOriginal)
                   )
                {
                    return(PosTaggerInputTypeResult.IsAbbreviation);
                }
                break;
        }
    }

    var first_ch = *_base;
    var first_ct = *(_CTM + first_ch);

    // "First upper" is only considered for tokens longer than one character, which also
    // makes the read of the second character below safe.
    var isFirstUpper = (1 < length) && ((first_ct & CharType.IsUpper) == CharType.IsUpper);
    if (isFirstUpper)
    {
        // Mixed case without dots, starting with an upper-case letter - FstC
        if ((lowerCount != 0) && (0 < upperCount) && (pointCount == 0))
        {
            return(PosTaggerInputTypeResult.FstC);
        }

        // Upper-case letter immediately followed by a dot - OneCP
        if (pointCount != 0)
        {
            var ch = *(_base + 1);
            if (xlat.IsDot(ch))
            {
                return(PosTaggerInputTypeResult.OneCP);
            }
        }
    }

    // Anything starting with a digit that was not classified above - Num
    if ((first_ct & CharType.IsDigit) == CharType.IsDigit)
    {
        return(PosTaggerInputTypeResult.Num);
    }

    // "<number word>-<suffix>": the part before the first hyphen is a known number word;
    // the part starting at the position returned by LastPositionOfHyphen is passed to
    // CreateNum in upper case.
    if (firstHyphenIndex != -1)
    {
        var firstNumberWord = word.valueUpper.Substring(0, firstHyphenIndex);
        if (_Numbers.Contains(firstNumberWord))
        {
            var p = LastPositionOfHyphen(_base, length);
            var v = word.valueUpper.Substring(p); // upper-case value
            return(PosTaggerInputTypeResult.CreateNum(v));
        }
    }

    // No digits and no letters at all - punctuation
    if ((digitsCount == 0) && (lowerCount == 0) && (upperCount == 0))
    {
        return(PosTaggerInputTypeResult.IsPunctuation);
    }

    return(PosTaggerInputTypeResult.O);
}
/// <summary>
/// Reads the next sentence from <paramref name="textReader"/> into <c>_Words</c>
/// (which is cleared first). A sentence is a run of non-blank lines terminated by a
/// blank (empty or whitespace-only) line; each line is split by <c>SPLIT_CHARS</c>, column 0
/// being the word text and column 2 the PoS tag.
/// </summary>
/// <param name="textReader">Source of the input lines.</param>
/// <param name="lineNumber">Running line counter, incremented for every line read; used in error messages.</param>
/// <returns>
/// <c>true</c> when a blank line terminated the sentence; <c>false</c> when the end of the
/// input was reached.
/// </returns>
/// <exception cref="InvalidDataException">A non-blank line has fewer than three columns.</exception>
unsafe private bool ReadNextSent(TextReader textReader, ref int lineNumber)
{
    _Words.Clear();

    string line;
    while ((line = textReader.ReadLine()) != null)
    {
        lineNumber++;

        // A blank line marks the end of the current sentence.
        if (string.IsNullOrWhiteSpace(line))
        {
            return (true);
        }

        var a = line.Split(SPLIT_CHARS, StringSplitOptions.RemoveEmptyEntries);
        // Column 2 is read below, so at least three columns are required. (The former
        // "< 2" check let two-column lines through, which then failed with an
        // IndexOutOfRangeException instead of this descriptive error.)
        if (a.Length < 3)
        {
            throw (new InvalidDataException("Wrong input data format. APPROXIMITE-LINE-NUMBER: " + lineNumber + ", line-TEXT: '" + line + '\''));
        }

        // The letter 'ё' is normalised to 'е' (both cases) in the word text.
        var v = a[0].Trim().Replace('ё', 'е').Replace('Ё', 'Е');
        var p = ToPosTaggerOutputType(a[2].Trim());

        // Skip URLs: they are not added to the sentence at all.
        var urls = _UrlDetector.AllocateUrls(v);
        if (urls.Count != 0)
        {
            continue;
        }

        fixed (char* ptr = v)
        {
            var word = new word_t()
            {
                valueOriginal       = v,
                valueUpper          = v.ToUpperInvariant(),
                posTaggerOutputType = p,
            };

            var result = _PosTaggerInputTypeProcessor.GetResult(ptr, v.Length, word);
            word.posTaggerInputType     = result.posTaggerInputType;
            word.posTaggerExtraWordType = result.posTaggerExtraWordType;

            // An otherwise unclassified word that contains a space is a compound phrase.
            if ((word.posTaggerExtraWordType == PosTaggerExtraWordType.__DEFAULT__) &&
                (word.posTaggerInputType == PosTaggerInputType.O) &&
                word.valueOriginal.Contains(' '))
            {
                word.posTaggerInputType = PosTaggerInputType.CompPh;
            }

            _Words.Add(word);
        }
    }

    // NOTE(review): at end of input this returns false even if _Words already holds words
    // of a final sentence that was not followed by a blank line - confirm that callers
    // either handle this or that the input always ends with a blank line.
    return (false);
}
/*
 * morphological analyzer::{PartOfSpeechEnum}    PoS-tagger::{PosTaggerOutputType}
 *  PartOfSpeechEnum.Adjective                    PosTaggerOutputType.Adjective
 *                                                PosTaggerOutputType.AdjectivePronoun
 *  PartOfSpeechEnum.Adverb                       PosTaggerOutputType.Adverb
 *                                                PosTaggerOutputType.AdverbialPronoun
 *  PartOfSpeechEnum.Article                      PosTaggerOutputType.Article
 *  PartOfSpeechEnum.Conjunction                  PosTaggerOutputType.Conjunction
 *  PartOfSpeechEnum.Interjection                 PosTaggerOutputType.Interjection
 *  PartOfSpeechEnum.Noun                         PosTaggerOutputType.Noun
 *  PartOfSpeechEnum.Numeral                      PosTaggerOutputType.Numeral
 *  PartOfSpeechEnum.Other                        PosTaggerOutputType.Other
 *  PartOfSpeechEnum.Particle                     PosTaggerOutputType.Particle
 *  PartOfSpeechEnum.Predicate                    PosTaggerOutputType.Predicate
 *  PartOfSpeechEnum.Preposition                  PosTaggerOutputType.Preposition
 *  PartOfSpeechEnum.Pronoun                      PosTaggerOutputType.Pronoun
 *                                                PosTaggerOutputType.PossessivePronoun
 *                                                PosTaggerOutputType.AdjectivePronoun
 *                                                PosTaggerOutputType.AdverbialPronoun
 *  PartOfSpeechEnum.Verb                         PosTaggerOutputType.Verb
 *                                                PosTaggerOutputType.Infinitive
 *                                                PosTaggerOutputType.AdverbialParticiple
 *                                                PosTaggerOutputType.AuxiliaryVerb
 *                                                PosTaggerOutputType.Participle
 *  -                                             PosTaggerOutputType.Punctuation
 */
#endregion

/// <summary>
/// Aligns <c>word.posTaggerOutputType</c> with the single part of speech reported by the
/// morphological analyzer. If the current tag is already one of the tagger types listed for
/// that part of speech in the table above, it is left untouched; otherwise it is replaced by
/// the primary tagger type for that part of speech.
/// </summary>
/// <param name="word">Word whose tagger output type may be overwritten.</param>
/// <param name="singlePartOfSpeech">The part of speech to align with.</param>
/// <exception cref="ArgumentException">
/// <paramref name="singlePartOfSpeech"/> is not one of the handled values.
/// </exception>
private static void CorrectPosTaggerOutputType(word_t word, PartOfSpeechEnum singlePartOfSpeech)
{
    switch (singlePartOfSpeech)
    {
        // Parts of speech with several acceptable tagger types: overwrite only on mismatch.
        case PartOfSpeechEnum.Adjective:
        {
            var tag = word.posTaggerOutputType;
            if ((tag != PosTaggerOutputType.Adjective) &&
                (tag != PosTaggerOutputType.AdjectivePronoun))
            {
                word.posTaggerOutputType = PosTaggerOutputType.Adjective;
            }
            break;
        }

        case PartOfSpeechEnum.Adverb:
        {
            var tag = word.posTaggerOutputType;
            if ((tag != PosTaggerOutputType.Adverb) &&
                (tag != PosTaggerOutputType.AdverbialPronoun))
            {
                word.posTaggerOutputType = PosTaggerOutputType.Adverb;
            }
            break;
        }

        case PartOfSpeechEnum.Pronoun:
        {
            var tag = word.posTaggerOutputType;
            if ((tag != PosTaggerOutputType.Pronoun) &&
                (tag != PosTaggerOutputType.PossessivePronoun) &&
                (tag != PosTaggerOutputType.AdjectivePronoun) &&
                (tag != PosTaggerOutputType.AdverbialPronoun))
            {
                word.posTaggerOutputType = PosTaggerOutputType.Pronoun;
            }
            break;
        }

        case PartOfSpeechEnum.Verb:
        {
            var tag = word.posTaggerOutputType;
            if ((tag != PosTaggerOutputType.Verb) &&
                (tag != PosTaggerOutputType.Infinitive) &&
                (tag != PosTaggerOutputType.AdverbialParticiple) &&
                (tag != PosTaggerOutputType.AuxiliaryVerb) &&
                (tag != PosTaggerOutputType.Participle))
            {
                word.posTaggerOutputType = PosTaggerOutputType.Verb;
            }
            break;
        }

        // One-to-one mappings: always overwrite.
        case PartOfSpeechEnum.Article:
            word.posTaggerOutputType = PosTaggerOutputType.Article;
            break;

        case PartOfSpeechEnum.Conjunction:
            word.posTaggerOutputType = PosTaggerOutputType.Conjunction;
            break;

        case PartOfSpeechEnum.Interjection:
            word.posTaggerOutputType = PosTaggerOutputType.Interjection;
            break;

        case PartOfSpeechEnum.Noun:
            word.posTaggerOutputType = PosTaggerOutputType.Noun;
            break;

        case PartOfSpeechEnum.Numeral:
            word.posTaggerOutputType = PosTaggerOutputType.Numeral;
            break;

        case PartOfSpeechEnum.Other:
            word.posTaggerOutputType = PosTaggerOutputType.Other;
            break;

        case PartOfSpeechEnum.Particle:
            word.posTaggerOutputType = PosTaggerOutputType.Particle;
            break;

        case PartOfSpeechEnum.Predicate:
            word.posTaggerOutputType = PosTaggerOutputType.Predicate;
            break;

        case PartOfSpeechEnum.Preposition:
            word.posTaggerOutputType = PosTaggerOutputType.Preposition;
            break;

        default:
            throw new ArgumentException(singlePartOfSpeech.ToString());
    }
}
/// <summary>
/// Stub classifier: ignores all of its arguments and always reports
/// <c>PosTaggerInputTypeResult.O</c>.
/// </summary>
/// <param name="_base">Unused.</param>
/// <param name="length">Unused.</param>
/// <param name="word">Unused.</param>
/// <returns>Always <c>PosTaggerInputTypeResult.O</c>.</returns>
public unsafe PosTaggerInputTypeResult GetResult(char *_base, int length, word_t word)
{
    var unclassified = PosTaggerInputTypeResult.O;
    return unclassified;
}