Example #1
0
            /// <summary>
            /// Advances the current node by one word, using <c>word.nerOutputType</c> as the transition key.
            /// When the current node has no such transition, its <c>Failure</c> link is followed
            /// until a transition is found or the root is reached.
            /// </summary>
            /// <param name="word">Word whose <c>nerOutputType</c> selects the transition.</param>
            /// <param name="node">Receives the node reached after the step, or <c>null</c> when the word is skipped.</param>
            /// <returns><c>false</c> when <c>word.IsWordInNerChain</c> is set (the word is skipped); otherwise <c>true</c>.</returns>
            public bool Find(word_t word, out TreeNode node)
            {
                for (; ; )
                {
                    // Words that already belong to a NER chain are never matched.
                    if (word.IsWordInNerChain)
                    {
                        node = null;
                        return false;
                    }

                    var next = _Node.GetTransition(word.nerOutputType);
                    if (next != null)
                    {
                        // A transition exists: take it and stop.
                        _Node = next;
                        break;
                    }

                    if (_Node == _Root)
                    {
                        // No transition even from the root: stay on the root.
                        break;
                    }

                    // Fall back along the failure link and try again.
                    _Node = _Node.Failure;
                }

                node = _Node;
                return true;
            }
        /*
         * This applies at the morphology + tagger stage, before disambiguation.
         *
         * When there are several normalization variants that differ in letter case, candidates are selected as follows:
         * - if the word starts with a [_lower-case_] letter and its part of speech is      NOUN; ADJECTIVE; ADVERB, select [_all_] variants;
         * - if the word starts with a [_lower-case_] letter and its part of speech is _not_ NOUN; ADJECTIVE; ADVERB, select the variants starting with a [_lower-case_] letter;
         * - if the word starts with an [_Upper-case_] letter and it is the {_first_} word of the sentence, select [_all_] variants;
         * - if the word starts with an [_Upper-case_] letter, its part of speech is      NOUN; ADJECTIVE; ADVERB and it is {_not_the_first_} word of the sentence, select the variants starting with an [_upper-case_] letter;
         * - if the word starts with an [_Upper-case_] letter, its part of speech is _not_ NOUN; ADJECTIVE; ADVERB and it is {_not_the_first_} word of the sentence, select [_all_] variants;
         */
        #endregion
        /// <summary>
        /// Chooses the word-form morphology mode for a word from its position in the sentence,
        /// the case of its first character and its PoS-tagger output type.
        /// </summary>
        /// <param name="word">Word being analysed.</param>
        /// <param name="wordindex">Zero-based index of the word; index 0 always yields <c>Default</c>.</param>
        /// <returns>The mode to use when selecting morphology variants for the word.</returns>
        private static WordFormMorphologyModeEnum GetWordFormMorphologyMode(word_t word, int wordindex)
        {
            // The first word gets the default mode regardless of its case.
            if (wordindex == 0)
            {
                return WordFormMorphologyModeEnum.Default;
            }

            // Lower-case words (not in first position) always get the same mode.
            if (!word.posTaggerFirstCharIsUpper)
            {
                return WordFormMorphologyModeEnum.FirstStartsWithLowerAfterUpperLetter;
            }

            // Capitalised word that is not first: the mode depends on the part of speech.
            var pos = word.posTaggerOutputType;
            var isNounAdjectiveOrAdverb = pos == PosTaggerOutputType.Noun
                                       || pos == PosTaggerOutputType.Adjective
                                       || pos == PosTaggerOutputType.Adverb;

            return isNounAdjectiveOrAdverb
                ? WordFormMorphologyModeEnum.FirstStartsWithUpperAfterLowerLetter
                : WordFormMorphologyModeEnum.Default;
        }
 /// <summary>
 /// Stores the word, its punctuation type and the list of morphology tuples as given.
 /// The list reference is kept as-is (no copy is made).
 /// </summary>
 /// <param name="word">Word this instance describes.</param>
 /// <param name="punctuationType">Punctuation classification of the word.</param>
 /// <param name="morphoAmbiguityTuples">Morphology tuples for the word.</param>
 public WordMorphoAmbiguity_t(word_t word,
                              MorphoAmbiguityTuple_t.PunctuationTypeEnum punctuationType,
                              List<MorphoAmbiguityTuple_t> morphoAmbiguityTuples)
 {
     this.Word = word;
     this.PunctuationType = punctuationType;
     this.MorphoAmbiguityTuples = morphoAmbiguityTuples;
 }
 /// <summary>
 /// Stores the word together with one morphology variant and the word's punctuation type.
 /// </summary>
 /// <param name="word">Word the tuple belongs to.</param>
 /// <param name="wordFormMorphology">Morphology variant for the word.</param>
 /// <param name="punctuationType">Punctuation classification of the word.</param>
 public unsafe MorphoAmbiguityTuple_t(word_t word,
                                      WordFormMorphology_t wordFormMorphology,
                                      PunctuationTypeEnum punctuationType)
 {
     this.Word = word;
     this.WordFormMorphology = wordFormMorphology;
     this.PunctuationType = punctuationType;
 }
        /// <summary>
        /// Builds a <c>WordMorphoAmbiguity_t</c> for a word with a single, default-constructed morphology.
        /// The tuple list is taken from the internal buffer slot at <paramref name="wordIdex"/>,
        /// which is cleared and reused on every call.
        /// </summary>
        /// <param name="word">Word to wrap.</param>
        /// <param name="wordIdex">Index of the buffer slot to use; missing slots are allocated.</param>
        /// <returns>A new instance that shares the buffer slot's list.</returns>
        public unsafe WordMorphoAmbiguity_t Create(word_t word, int wordIdex)
        {
            var pool = _MorphoAmbiguityTuples_Buffer;

            // Make sure a reusable list exists for this slot.
            while (pool.Count <= wordIdex)
            {
                pool.Add(new List<MorphoAmbiguityTuple_t>(DEFAULT_WORDFORMMORPHOLOGY_COUNT));
            }

            var punctuationType = MorphoAmbiguityTuple_t.GetPunctuationType(word);

            var tuples = pool[wordIdex];
            tuples.Clear();
            tuples.Add(new MorphoAmbiguityTuple_t(word, new WordFormMorphology_t(), punctuationType));

            return new WordMorphoAmbiguity_t(word, punctuationType, tuples);
        }
        /// <summary>
        /// Builds a <c>WordMorphoAmbiguity_t</c> for a word with one tuple per supplied morphology.
        /// The tuple list is taken from the internal buffer slot at <paramref name="wordIdex"/>,
        /// which is cleared and reused on every call.
        /// </summary>
        /// <param name="word">Word to wrap.</param>
        /// <param name="wordIdex">Index of the buffer slot to use; missing slots are allocated.</param>
        /// <param name="wordFormMorphologies">Morphology variants; one tuple is created for each, in order.</param>
        /// <returns>A new instance that shares the buffer slot's list.</returns>
        public unsafe WordMorphoAmbiguity_t Create(word_t word, int wordIdex, WordFormMorphology_t[] wordFormMorphologies)
        {
            var pool = _MorphoAmbiguityTuples_Buffer;

            // Make sure a reusable list exists for this slot.
            while (pool.Count <= wordIdex)
            {
                pool.Add(new List<MorphoAmbiguityTuple_t>(DEFAULT_WORDFORMMORPHOLOGY_COUNT));
            }

            var punctuationType = MorphoAmbiguityTuple_t.GetPunctuationType(word);

            var tuples = pool[wordIdex];
            tuples.Clear();
            foreach (var morphology in wordFormMorphologies)
            {
                tuples.Add(new MorphoAmbiguityTuple_t(word, morphology, punctuationType));
            }

            return new WordMorphoAmbiguity_t(word, punctuationType, tuples);
        }
        /// <summary>
        /// Classifies a word as quote, bracket, other punctuation, or non-punctuation.
        /// </summary>
        /// <remarks>
        /// Side effect: when the first character of a punctuation word is a quote character,
        /// <c>word.nerInputType</c> is set to <c>NerInputType.Q</c>.
        /// </remarks>
        /// <param name="word">Word to classify; only the first character of <c>valueOriginal</c> is inspected.</param>
        /// <returns>The punctuation category of the word.</returns>
        public static unsafe PunctuationTypeEnum GetPunctuationType(word_t word)
        {
            if (word.posTaggerOutputType != PosTaggerOutputType.Punctuation)
            {
                return PunctuationTypeEnum.__NonPunctuation__;
            }

            // Already marked as a quote: no need to look at the characters.
            if (word.nerInputType == NerInputType.Q)
            {
                return PunctuationTypeEnum.PunctuationQuote;
            }

            fixed (char* firstChar = word.valueOriginal)
            {
                // Look up the character class of the first character.
                var charType = *(xlat_Unsafe.Inst._CHARTYPE_MAP + *firstChar);

                if ((charType & CharType.IsQuote) == CharType.IsQuote)
                {
                    word.nerInputType = NerInputType.Q;
                    return PunctuationTypeEnum.PunctuationQuote;
                }

                if ((charType & CharType.IsBracket) == CharType.IsBracket)
                {
                    return PunctuationTypeEnum.PunctuationBracket;
                }

                return PunctuationTypeEnum.Punctuation;
            }
        }
Example #8
0
        /// <summary>
        /// Classifies a token into a PoS-tagger input type from its character composition
        /// (digits, lower/upper-case letters, hyphens, dots, Roman-numeral symbols) and from
        /// lookups in the <c>_Numbers</c> and <c>_Abbreviations</c> sets.
        /// The checks are ordered; the first rule that matches determines the result.
        /// </summary>
        /// <param name="_base">Pointer to the first character of the token (presumably the text of <paramref name="word"/> - confirm against callers).</param>
        /// <param name="length">Number of characters to scan starting at <paramref name="_base"/>.</param>
        /// <param name="word">Supplies <c>valueUpper</c> and <c>valueOriginal</c> for the dictionary lookups.</param>
        /// <returns>The matched result; <c>PosTaggerInputTypeResult.O</c> when no rule applies.</returns>
        unsafe public PosTaggerInputTypeResult GetResult(char *_base, int length, word_t word)   //, string valueUpper )
        {
            //-1- per-class character counters
            int digitsCount      = 0,
                upperCount       = 0,
                lowerCount       = 0,
                hyphenCount      = 0,
                pointCount       = 0,
                romanNumberCount = 0;
            // index of the first hyphen that is preceded only by letters (see the main cycle); -1 if none
            int firstHyphenIndex = -1;

            //-2- single pass over the token: count characters by class
            #region [.main cycle.]
            for (int i = 0; i < length; i++)
            {
                var ch = *(_base + i);
                var ct = *(_CTM + ch);
                if ((ct & CharType.IsDigit) == CharType.IsDigit)
                {
                    digitsCount++;
                }
                else if ((ct & CharType.IsLower) == CharType.IsLower)
                {
                    lowerCount++;
                }
                else if ((ct & CharType.IsUpper) == CharType.IsUpper)
                {
                    upperCount++;
                    // Roman-numeral symbols are only looked for among upper-case characters
                    if (IsRomanSymbol(ch))
                    {
                        romanNumberCount++;
                    }
                }
                else if ((ct & CharType.IsHyphen) == CharType.IsHyphen)   //if ( xlat.IsHyphen( ch ) )
                {
                    hyphenCount++;

                    // remember the first hyphen only if it is not the first character, no digit has been
                    // seen so far and every preceding character was a letter
                    if ((firstHyphenIndex == -1) && (i != 0) && (digitsCount == 0) && (i == lowerCount + upperCount))
                    {
                        firstHyphenIndex = i;
                    }
                }
                else if (xlat.IsDot(ch))
                {
                    pointCount++;
                }
            }
            #endregion

            if (pointCount == 0)
            {
                // no digits and every character (hyphens aside) is a Roman-numeral symbol - Num
                if ((digitsCount == 0) && (0 < romanNumberCount) && ((romanNumberCount == length) || (romanNumberCount == length - hyphenCount)))
                {
                    return(PosTaggerInputTypeResult.Num);
                }

                /*-different-from-russian-
                 * if ( IsLatin( _base, length ) )
                 *  return (PosTaggerInputTypeResult.AllLat);
                 */
            }


            if ((lowerCount == 0) && (upperCount == 0))
            {
                // digits in any combination with punctuation marks, without letters - NUM
                if (digitsCount != 0)
                {
                    return(PosTaggerInputTypeResult.Num);
                }

                var _first_ch = *_base;
                switch (_first_ch)
                {
                // comma - Com
                case ',': return(PosTaggerInputTypeResult.Com);

                // colon - Col
                case ':': return(PosTaggerInputTypeResult.Col);
                }

                var _first_ct = *(_CTM + _first_ch);
                // hyphen - Dush
                if ((_first_ct & CharType.IsHyphen) == CharType.IsHyphen)   //if ( xlat.IsHyphen( _first_ch ) )
                {
                    return(PosTaggerInputTypeResult.Dush);
                }
            }
            else
            if ((digitsCount == 0) && (firstHyphenIndex == -1))
            {
                // letters present, no digits, no letters-then-hyphen prefix: dictionary lookups by dot count
                switch (pointCount)
                {
                case 0:
                    if ((hyphenCount == 0) &&
                        _Numbers.Contains(word.valueUpper)
                        )
                    {
                        return(PosTaggerInputTypeResult.CreateNum());      // return (PosTaggerInputTypeResult.Num);
                    }
                    break;

                case 1:
                    if ((hyphenCount == 0) &&                         //no hyphen
                        (xlat.IsDot(*(_base + length - 1))) &&        //if dot is last char
                        _Numbers.Contains(word.valueUpper)
                        )
                    {
                        return(PosTaggerInputTypeResult.CreateNum());
                    }
                    break;
                /*---previous commented---case 1: break;*/

                default:                                            //1 < pointCount
                    if ((hyphenCount == 0) &&
                        _Abbreviations.Contains(word.valueOriginal) //-case-sensitive!!!-_Abbreviations.Contains( word.valueUpper )
                        )
                    {
                        return(PosTaggerInputTypeResult.IsAbbreviation);
                    }
                    break;
                }
            }


            var first_ch = *_base;
            var first_ct = *(_CTM + first_ch);

            // only tokens longer than one character count as "first upper"
            var isFirstUpper = (1 < length) && ((first_ct & CharType.IsUpper) == CharType.IsUpper);
            if (isFirstUpper)
            {
                // mixed-case token without dots that starts with an upper-case letter - FstC
                if ((lowerCount != 0) && (0 < upperCount) && (pointCount == 0))
                {
                    return(PosTaggerInputTypeResult.FstC);
                }

                // upper-case letter immediately followed by a dot - OneCP
                if (pointCount != 0)
                {
                    var ch = *(_base + 1);
                    if (xlat.IsDot(ch))
                    {
                        return(PosTaggerInputTypeResult.OneCP);
                    }
                }
            }


            // token starts with a digit - Num
            if ((first_ct & CharType.IsDigit) == CharType.IsDigit)
            {
                return(PosTaggerInputTypeResult.Num);
            }

            if (firstHyphenIndex != -1)
            {
                // the part before the first hyphen is a known number word:
                // return Num carrying the upper-case text starting at the position given by LastPositionOfHyphen
                var firstNumberWord = word.valueUpper.Substring(0, firstHyphenIndex);
                if (_Numbers.Contains(firstNumberWord))
                {
                    var p = LastPositionOfHyphen(_base, length);
                    //---var v = new string( _base, p, length - p ); //original-value
                    var v = word.valueUpper.Substring(p);            //upper-case-value
                    return(PosTaggerInputTypeResult.CreateNum(v));   // return (PosTaggerInputTypeResult.Num);
                }
            }

            // neither digits nor letters - punctuation
            if ((digitsCount == 0) && (lowerCount == 0) && (upperCount == 0))
            {
                return(PosTaggerInputTypeResult.IsPunctuation);
            }

            return(PosTaggerInputTypeResult.O);
        }
Example #9
0
        /// <summary>
        /// Reads the next sentence (a run of non-blank lines terminated by a blank line) from
        /// <paramref name="textReader"/> into <c>_Words</c>, one word per line.
        /// </summary>
        /// <remarks>
        /// Each line is split by <c>SPLIT_CHARS</c>; column 0 is the word text and column 2 is the
        /// PoS tag, so at least three columns are required. Lines whose word text contains a URL
        /// are skipped.
        /// NOTE(review): when the reader ends without a trailing blank line the method returns
        /// <c>false</c> even if words were collected - confirm callers expect this.
        /// </remarks>
        /// <param name="textReader">Source of the input lines.</param>
        /// <param name="lineNumber">Running line counter, incremented for every line read.</param>
        /// <returns><c>true</c> when a blank line ended the sentence; <c>false</c> at end of input.</returns>
        /// <exception cref="InvalidDataException">A non-blank line has fewer than three columns.</exception>
        private unsafe bool ReadNextSent(TextReader textReader, ref int lineNumber)
        {
            _Words.Clear();

            for (var line = textReader.ReadLine(); ; line = textReader.ReadLine())
            {
                if (line == null)
                {
                    return false;
                }

                lineNumber++;

                // A blank line terminates the sentence.
                if (string.IsNullOrWhiteSpace(line))
                {
                    break;
                }

                var a = line.Split(SPLIT_CHARS, StringSplitOptions.RemoveEmptyEntries);
                // Column 2 is read below, so three columns are required. The previous "< 2" check
                // let two-column lines through and they failed with IndexOutOfRangeException.
                if (a.Length < 3)
                {
                    throw (new InvalidDataException("Wrong input data format. APPROXIMITE-LINE-NUMBER: " + lineNumber + ", line-TEXT: '" + line + '\''));
                }

                var v = a[0].Trim().Replace('ё', 'е').Replace('Ё', 'Е');
                var p = ToPosTaggerOutputType(a[2].Trim());

                // Skip words that contain URLs.
                var urls = _UrlDetector.AllocateUrls(v);
                if (urls.Count != 0)
                {
                    continue;
                }

                fixed (char* ptr = v)
                {
                    var word = new word_t()
                    {
                        valueOriginal       = v,
                        valueUpper          = v.ToUpperInvariant(),
                        posTaggerOutputType = p,
                    };
                    var result = _PosTaggerInputTypeProcessor.GetResult(ptr, v.Length, word);

                    word.posTaggerInputType     = result.posTaggerInputType;
                    word.posTaggerExtraWordType = result.posTaggerExtraWordType;

                    // An otherwise unclassified word that contains a space is marked as CompPh.
                    if ((word.posTaggerExtraWordType == PosTaggerExtraWordType.__DEFAULT__) &&
                        (word.posTaggerInputType == PosTaggerInputType.O) &&
                        word.valueOriginal.Contains(' '))
                    {
                        word.posTaggerInputType = PosTaggerInputType.CompPh;
                    }

                    _Words.Add(word);
                }
            }

            return true;
        }
        /*
         * morphological analyzer::{PartOfSpeechEnum}  PoS-tagger::{PosTaggerOutputType}
         * PartOfSpeechEnum.Adjective	    PosTaggerOutputType.Adjective
         *                              PosTaggerOutputType.AdjectivePronoun
         * PartOfSpeechEnum.Adverb	        PosTaggerOutputType.Adverb
         *                              PosTaggerOutputType.AdverbialPronoun
         * PartOfSpeechEnum.Article	    PosTaggerOutputType.Article
         * PartOfSpeechEnum.Conjunction	PosTaggerOutputType.Conjunction
         * PartOfSpeechEnum.Interjection	PosTaggerOutputType.Interjection
         * PartOfSpeechEnum.Noun	        PosTaggerOutputType.Noun
         * PartOfSpeechEnum.Numeral	    PosTaggerOutputType.Numeral
         * PartOfSpeechEnum.Other	        PosTaggerOutputType.Other
         * PartOfSpeechEnum.Particle	    PosTaggerOutputType.Particle
         * PartOfSpeechEnum.Predicate	    PosTaggerOutputType.Predicate
         * PartOfSpeechEnum.Preposition	PosTaggerOutputType.Preposition
         * PartOfSpeechEnum.Pronoun	    PosTaggerOutputType.Pronoun 
         *                                  PosTaggerOutputType.PossessivePronoun
         *                                  PosTaggerOutputType.AdjectivePronoun  
         *                                  PosTaggerOutputType.AdverbialPronoun
         * PartOfSpeechEnum.Verb	        PosTaggerOutputType.Verb
         *                                  PosTaggerOutputType.Infinitive
         *                                  PosTaggerOutputType.AdverbialParticiple
         *                                  PosTaggerOutputType.AuxiliaryVerb
         *                                  PosTaggerOutputType.Participle
         * -	                            PosTaggerOutputType.Punctuation
         */
        #endregion

        /// <summary>
        /// Aligns <c>word.posTaggerOutputType</c> with the single part of speech reported by the
        /// morphology analyzer. For Adjective, Adverb, Pronoun and Verb the current value is kept
        /// when it is already one of the tagger types listed for that part of speech; otherwise,
        /// and for every other part of speech, the matching tagger type is assigned.
        /// </summary>
        /// <param name="word">Word whose tagger output type is corrected in place.</param>
        /// <param name="singlePartOfSpeech">Part of speech reported by the morphology analyzer.</param>
        /// <exception cref="ArgumentException"><paramref name="singlePartOfSpeech"/> is not a handled value.</exception>
        private static void CorrectPosTaggerOutputType(word_t word, PartOfSpeechEnum singlePartOfSpeech)
        {
            PosTaggerOutputType current;

            switch (singlePartOfSpeech)
            {
            // --- parts of speech with several acceptable tagger types ---
            case PartOfSpeechEnum.Adjective:
                current = word.posTaggerOutputType;
                if ((current != PosTaggerOutputType.Adjective) &&
                    (current != PosTaggerOutputType.AdjectivePronoun))
                {
                    word.posTaggerOutputType = PosTaggerOutputType.Adjective;
                }
                return;

            case PartOfSpeechEnum.Adverb:
                current = word.posTaggerOutputType;
                if ((current != PosTaggerOutputType.Adverb) &&
                    (current != PosTaggerOutputType.AdverbialPronoun))
                {
                    word.posTaggerOutputType = PosTaggerOutputType.Adverb;
                }
                return;

            case PartOfSpeechEnum.Pronoun:
                current = word.posTaggerOutputType;
                if ((current != PosTaggerOutputType.Pronoun) &&
                    (current != PosTaggerOutputType.PossessivePronoun) &&
                    (current != PosTaggerOutputType.AdjectivePronoun) &&
                    (current != PosTaggerOutputType.AdverbialPronoun))
                {
                    word.posTaggerOutputType = PosTaggerOutputType.Pronoun;
                }
                return;

            case PartOfSpeechEnum.Verb:
                current = word.posTaggerOutputType;
                if ((current != PosTaggerOutputType.Verb) &&
                    (current != PosTaggerOutputType.Infinitive) &&
                    (current != PosTaggerOutputType.AdverbialParticiple) &&
                    (current != PosTaggerOutputType.AuxiliaryVerb) &&
                    (current != PosTaggerOutputType.Participle))
                {
                    word.posTaggerOutputType = PosTaggerOutputType.Verb;
                }
                return;

            // --- one-to-one mappings: always overwrite ---
            case PartOfSpeechEnum.Article:
                word.posTaggerOutputType = PosTaggerOutputType.Article;
                return;

            case PartOfSpeechEnum.Conjunction:
                word.posTaggerOutputType = PosTaggerOutputType.Conjunction;
                return;

            case PartOfSpeechEnum.Interjection:
                word.posTaggerOutputType = PosTaggerOutputType.Interjection;
                return;

            case PartOfSpeechEnum.Noun:
                word.posTaggerOutputType = PosTaggerOutputType.Noun;
                return;

            case PartOfSpeechEnum.Numeral:
                word.posTaggerOutputType = PosTaggerOutputType.Numeral;
                return;

            case PartOfSpeechEnum.Other:
                word.posTaggerOutputType = PosTaggerOutputType.Other;
                return;

            case PartOfSpeechEnum.Particle:
                word.posTaggerOutputType = PosTaggerOutputType.Particle;
                return;

            case PartOfSpeechEnum.Predicate:
                word.posTaggerOutputType = PosTaggerOutputType.Predicate;
                return;

            case PartOfSpeechEnum.Preposition:
                word.posTaggerOutputType = PosTaggerOutputType.Preposition;
                return;

            default:
                throw new ArgumentException(singlePartOfSpeech.ToString());
            }
        }
Example #11
0
 /// <summary>
 /// Stub implementation: ignores all arguments and always reports <c>PosTaggerInputTypeResult.O</c>.
 /// </summary>
 /// <param name="_base">Unused.</param>
 /// <param name="length">Unused.</param>
 /// <param name="word">Unused.</param>
 /// <returns>Always <c>PosTaggerInputTypeResult.O</c>.</returns>
 public unsafe PosTaggerInputTypeResult GetResult(char* _base, int length, word_t word)
 {
     var unclassified = PosTaggerInputTypeResult.O;
     return unclassified;
 }