Ejemplo n.º 1
0
        /// <summary>
        /// Performs morphological analysis for every word in <paramref name="words"/>, collects one
        /// morpho-ambiguity entry per word, and then runs the morpho-ambiguity pre-processor and
        /// resolver over the collected entries.
        /// </summary>
        /// <remarks>
        /// Per word the method either adds an entry without morphology (word skipped, nothing to
        /// analyse, or the analyzer returned no word forms) or an entry with the word-form
        /// morphologies selected by the part-of-speech matching rules (clauses #1 - #3 below).
        /// In DEBUG builds the method takes an additional <c>bool applyMorphoAmbiguityPreProcess</c>
        /// parameter; when it is <c>false</c> the pre-processor step is skipped. Other builds always
        /// run the pre-processor.
        /// The shared entry buffer is cleared in a <c>finally</c> block, so an exception thrown while
        /// processing cannot leave entries behind for the next call.
        /// </remarks>
        /// <param name="words">Words to analyse, addressed by index; the index is stored in each entry.</param>
        public unsafe void Run(List<Word> words
#if DEBUG
                               , bool applyMorphoAmbiguityPreProcess
#endif
                               )
        {
            try
            {
                var wordMorphology = default(WordMorphology);

                for (int i = 0, wordsLength = words.Count; i < wordsLength; i++)
                {
                    var word = words[i];

                    // Words with a non-default extra word type get an entry without morphology.
                    if (word.posTaggerExtraWordType != PosTaggerExtraWordType.__DEFAULT__)
                    {
                        _wordMorphoAmbiguities.Add(_wordMorphoAmbiguityFactory.Create(word, i));
                        continue;
                    }

                    switch (word.posTaggerInputType)
                    {
                        case PosTaggerInputType.O:          // "O" - other
                        case PosTaggerInputType.AllLat:     // Latin only: no lowercase letters and no dots
                        case PosTaggerInputType.FstC:       // first letter is a capital, contains no spaces
                        {
                            if (word.valueUpper == null)
                            {
                                _wordMorphoAmbiguities.Add(_wordMorphoAmbiguityFactory.Create(word, i));
                                continue;
                            }

                            var mode = GetWordFormMorphologyMode(word, i);
                            wordMorphology = _morphoAnalyzer.GetWordMorphology_NoToUpper(word.valueUpper, mode);
                        }
                        break;

                        case PosTaggerInputType.Num:        // contains at least one digit and no letters
                        {
                            if (word.posTaggerLastValueUpperInNumeralChain == null)
                            {
                                _wordMorphoAmbiguities.Add(_wordMorphoAmbiguityFactory.Create(word, i));
                                continue;
                            }

                            var mode = GetWordFormMorphologyMode(word, i);
                            wordMorphology = _morphoAnalyzer.GetWordMorphology_4LastValueUpperInNumeralChain(
                                word.posTaggerLastValueUpperInNumeralChain, mode);
                        }
                        break;

                        default:
                            // Every other input type is not analysed.
                            _wordMorphoAmbiguities.Add(_wordMorphoAmbiguityFactory.Create(word, i));
                            continue;
                    }

                    #region post process MorphoAnalyze result

                    // The analyzer found nothing: entry without morphology.
                    if (!wordMorphology.HasWordFormMorphologies)
                    {
                        _wordMorphoAmbiguities.Add(_wordMorphoAmbiguityFactory.Create(word, i));
                        continue;
                    }

                    var wfms = default(WordFormMorphology[]);

                    // If the word has only one part of speech in the morpho-dictionary, use it instead
                    // of the one determined by the PoS-tagger.
                    if (wordMorphology.IsSinglePartOfSpeech)
                    {
                        CorrectPosTaggerOutputType(word, wordMorphology.PartOfSpeech);
                        wfms = wordMorphology.WordFormMorphologies.ToArray();
                        _wordMorphoAmbiguities.Add(_wordMorphoAmbiguityFactory.Create(word, i, wfms));
                        continue;
                    }

                    #region clause #1
                    // The morpho-dictionary defines several parts of speech for this word.
                    // Look for the morphology that matches the part of speech given by the PoS-tagger;
                    // if it exists, take it.
                    //
                    // The two special cases exist because of the double translation:
                    //   PosTaggerOutputType.AdjectivePronoun => PartOfSpeechEnum.Adjective, PartOfSpeechEnum.Pronoun
                    //   PosTaggerOutputType.AdverbialPronoun => PartOfSpeechEnum.Adverb,    PartOfSpeechEnum.Pronoun
                    var partOfSpeech = default(PartOfSpeechEnum?);
                    switch (word.posTaggerOutputType)
                    {
                        case PosTaggerOutputType.AdjectivePronoun:
                            wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Adjective, PartOfSpeechEnum.Pronoun);
                            break;

                        case PosTaggerOutputType.AdverbialPronoun:
                            wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Adverb, PartOfSpeechEnum.Pronoun);
                            break;

                        default:
                            // partOfSpeech is only set on this path; clause #3 below depends on it.
                            partOfSpeech = ToPartOfSpeech(word.posTaggerOutputType);
                            if (partOfSpeech.HasValue)
                            {
                                wfms = TryGetByPosTaggerOutputType(ref wordMorphology, partOfSpeech.Value);
                            }
                            break;
                    }

                    // wfms was null before the switch, so non-null means clause #1 found a match.
                    if (wfms != null)
                    {
                        _wordMorphoAmbiguities.Add(_wordMorphoAmbiguityFactory.Create(word, i, wfms));
                        continue;
                    }
                    #endregion

                    #region clause #2
                    // No direct match, and the dictionary defines several parts of speech for the word.
                    // Try the following correspondences first (left: PoS-tagger output, right:
                    // morpho-dictionary; the order of the items does not matter).
                    switch (word.posTaggerOutputType)
                    {
                        /* 2.3. Participle = Adjective */
                        case PosTaggerOutputType.Participle:
                            wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Adjective);
                            break;

                        default:
                            #region clause #3
                            if (partOfSpeech.HasValue)
                            {
                                switch (partOfSpeech.Value)
                                {
                                    /* 3.1. Pronoun = Noun */
                                    case PartOfSpeechEnum.Pronoun:
                                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Noun);
                                        break;

                                    /* 3.2. Noun = Pronoun */
                                    case PartOfSpeechEnum.Noun:
                                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Pronoun);
                                        break;

                                    /* 3.3. Conjunction = Particle */
                                    case PartOfSpeechEnum.Conjunction:
                                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Particle);
                                        break;

                                    /* 3.4. Particle = Conjunction */
                                    case PartOfSpeechEnum.Particle:
                                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Conjunction);
                                        break;

                                    /* 3.5. Numeral = Noun, Adjective */
                                    case PartOfSpeechEnum.Numeral:
                                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Noun, PartOfSpeechEnum.Adjective);
                                        break;

                                    /* 3.6. Adjective = Verb, Adverb */
                                    case PartOfSpeechEnum.Adjective:
                                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Verb, PartOfSpeechEnum.Adverb);
                                        break;

                                    /* 3.7. Adverb = Adjective */
                                    case PartOfSpeechEnum.Adverb:
                                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Adjective);
                                        break;
                                }
                            }
                            #endregion
                            break;
                    }

                    // If no correspondence was found, take the first part of speech returned by the
                    // morpho-dictionary and overwrite the word's PoS-tagger output type accordingly.
                    if (wfms == null)
                    {
                        var firstPartOfSpeech = wordMorphology.WordFormMorphologies[0].PartOfSpeech;
                        word.posTaggerOutputType = ToPosTaggerOutputType(firstPartOfSpeech);
                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, firstPartOfSpeech);
                    }

                    _wordMorphoAmbiguities.Add(_wordMorphoAmbiguityFactory.Create(word, i, wfms));
                    #endregion

                    #endregion
                }

                // Pre-process the collected entries; in DEBUG builds this step is optional.
#if DEBUG
                if (applyMorphoAmbiguityPreProcess)
#endif
                {
                    _morphoAmbiguityPreProcessor.Run(_wordMorphoAmbiguities);
                }

                _morphoAmbiguityResolver.Resolve(_wordMorphoAmbiguities);
            }
            finally
            {
                // Always reset the shared buffer, even when analysis or resolving throws; otherwise
                // the next call would append to stale entries.
                _wordMorphoAmbiguities.Clear();
            }
        }
        /// <summary>
        /// Performs morphological analysis for every word in <paramref name="words"/>, collects one
        /// morpho-ambiguity entry per word, and then runs the morpho-ambiguity pre-processor and
        /// resolver over the collected entries.
        /// </summary>
        /// <remarks>
        /// Per word the method either adds an entry without morphology (word skipped, nothing to
        /// analyse, or the analyzer returned no word forms) or an entry with the word-form
        /// morphologies selected by the part-of-speech matching rules (clauses #1 - #3 below).
        /// In DEBUG builds the method takes an additional <c>bool applyMorphoAmbiguityPreProcess</c>
        /// parameter (when <c>false</c> the pre-processor step is skipped) and also stores the selected
        /// morphologies in <c>word.morphologies</c>. Other builds always run the pre-processor.
        /// The shared entry buffer is cleared in a <c>finally</c> block, so an exception thrown while
        /// processing cannot leave entries behind for the next call.
        /// </remarks>
        /// <param name="words">Words to analyse, addressed by index; the index is stored in each entry.</param>
        public unsafe void Run(List<word_t> words
#if DEBUG
                               , bool applyMorphoAmbiguityPreProcess
#endif
                               )
        {
            try
            {
                //-apply morpho-analysis-//
                var wordMorphology = default(WordMorphology_t);

                for (int i = 0, wordsLength = words.Count; i < wordsLength; i++)
                {
                    var word = words[i];

                    // Words with a non-default extra word type get an entry without morphology.
                    if (word.posTaggerExtraWordType != PosTaggerExtraWordType.__DEFAULT__)
                    {
                        _WordMorphoAmbiguities.Add(_WordMorphoAmbiguityFactory.Create(word, i));
                        continue;
                    }

                    switch (word.posTaggerInputType)
                    {
                        case PosTaggerInputType.O:          // "O" - other
                        case PosTaggerInputType.AllLat:     // Latin only: no lowercase letters and no dots
                        case PosTaggerInputType.FstC:       // first letter is a capital, contains no spaces
                        {
                            if (word.valueUpper == null)
                            {
                                _WordMorphoAmbiguities.Add(_WordMorphoAmbiguityFactory.Create(word, i));
                                continue;
                            }

                            #if DEBUG
                            // Sanity check: the upper-case flag looked up in the _CTM table for the first
                            // original character must agree with the flag stored on the word.
                            var wordFirstCharIsUpper = (*(_CTM + word.valueOriginal[0]) & CharType.IsUpper) == CharType.IsUpper;
                            Debug.Assert(wordFirstCharIsUpper == word.posTaggerFirstCharIsUpper, "(wordFirstCharIsUpper != word.posTaggerFirstCharIsUpper)");
                            #endif

                            var mode = GetWordFormMorphologyMode(word, i);
                            wordMorphology = _MorphoAnalyzer.GetWordMorphology_NoToUpper(word.valueUpper, mode);

                            // Do NOT null out word.valueUpper here to free it: it is still used further on.
                        }
                        break;

                        case PosTaggerInputType.Num:        // contains at least one digit and no letters
                        {
                            if (word.posTaggerLastValueUpperInNumeralChain == null)
                            {
                                _WordMorphoAmbiguities.Add(_WordMorphoAmbiguityFactory.Create(word, i));
                                continue;
                            }

                            var mode = GetWordFormMorphologyMode(word, i);
                            wordMorphology = _MorphoAnalyzer.GetWordMorphology_4LastValueUpperInNumeralChain(
                                word.posTaggerLastValueUpperInNumeralChain, mode);

                            // Do NOT null out word.posTaggerLastValueOriginalInNumeralChain here to free it:
                            // it is still used further on.
                        }
                        break;

                        default:
                            // Input types that are skipped (not analysed):
                            //   CompPh - compound (contains at least one space)
                            //   Col    - colon
                            //   Com    - comma
                            //   Dush   - dash
                            //   OneCP  - first letter capital, followed by a dot
                            _WordMorphoAmbiguities.Add(_WordMorphoAmbiguityFactory.Create(word, i));
                            continue;
                    }

                    #region [.post-process Morpho-Analyze-result.]

                    // The analyzer found nothing: entry without morphology.
                    if (!wordMorphology.HasWordFormMorphologies)
                    {
                        _WordMorphoAmbiguities.Add(_WordMorphoAmbiguityFactory.Create(word, i));
                        continue;
                    }

                    var wfms = default(WordFormMorphology_t[]);

                    // If the word has only one part of speech in the morpho-dictionary, use it instead
                    // of the one determined by the PoS-tagger.
                    if (wordMorphology.IsSinglePartOfSpeech)
                    {
                        CorrectPosTaggerOutputType(word, wordMorphology.PartOfSpeech);
                        wfms = wordMorphology.WordFormMorphologies.ToArray();
                        _WordMorphoAmbiguities.Add(_WordMorphoAmbiguityFactory.Create(word, i, wfms));
                        #if DEBUG
                        word.morphologies = wfms;
                        #endif
                        continue;
                    }

                    #region [.clause #1.]
                    // The morpho-dictionary defines several parts of speech for this word.
                    // Look for the morphology that matches the part of speech given by the PoS-tagger;
                    // if it exists, take it.
                    //
                    // The two special cases exist because of the double translation:
                    //   PosTaggerOutputType.AdjectivePronoun => PartOfSpeechEnum.Adjective, PartOfSpeechEnum.Pronoun
                    //   PosTaggerOutputType.AdverbialPronoun => PartOfSpeechEnum.Adverb,    PartOfSpeechEnum.Pronoun
                    var partOfSpeech = default(PartOfSpeechEnum?);
                    switch (word.posTaggerOutputType)
                    {
                        case PosTaggerOutputType.AdjectivePronoun:
                            wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Adjective, PartOfSpeechEnum.Pronoun);
                            break;

                        case PosTaggerOutputType.AdverbialPronoun:
                            wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Adverb, PartOfSpeechEnum.Pronoun);
                            break;

                        default:
                            // partOfSpeech is only set on this path; clause #3 below depends on it.
                            partOfSpeech = ToPartOfSpeech(word.posTaggerOutputType);
                            if (partOfSpeech.HasValue)
                            {
                                wfms = TryGetByPosTaggerOutputType(ref wordMorphology, partOfSpeech.Value);
                            }
                            break;
                    }

                    // wfms was null before the switch, so non-null means clause #1 found a match.
                    if (wfms != null)
                    {
                        _WordMorphoAmbiguities.Add(_WordMorphoAmbiguityFactory.Create(word, i, wfms));
                        #if DEBUG
                        word.morphologies = wfms;
                        #endif
                        continue;
                    }
                    #endregion

                    #region [.clause #2.]
                    // No direct match, and the dictionary defines several parts of speech for the word.
                    // Try the following correspondences first (left: PoS-tagger output, right:
                    // morpho-dictionary; the order of the items does not matter).
                    // The former items 2.1 (AdjectivePronoun = Adjective) and 2.2 (AdverbialPronoun = Adverb)
                    // are no longer here; those tags are handled by the special cases in clause #1.
                    switch (word.posTaggerOutputType)
                    {
                        /* 2.3. Participle = Adjective */
                        case PosTaggerOutputType.Participle:
                            wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Adjective);
                            break;

                        default:
                            #region [.clause #3.]
                            if (partOfSpeech.HasValue)
                            {
                                switch (partOfSpeech.Value)
                                {
                                    /* 3.1. Pronoun = Noun */
                                    case PartOfSpeechEnum.Pronoun:
                                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Noun);
                                        break;

                                    /* 3.2. Noun = Pronoun */
                                    case PartOfSpeechEnum.Noun:
                                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Pronoun);
                                        break;

                                    /* 3.3. Conjunction = Particle */
                                    case PartOfSpeechEnum.Conjunction:
                                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Particle);
                                        break;

                                    /* 3.4. Particle = Conjunction */
                                    case PartOfSpeechEnum.Particle:
                                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Conjunction);
                                        break;

                                    /* 3.5. Numeral = Noun, Adjective */
                                    case PartOfSpeechEnum.Numeral:
                                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Noun, PartOfSpeechEnum.Adjective);
                                        break;

                                    /* 3.6. Adjective = Verb, Adverb */
                                    case PartOfSpeechEnum.Adjective:
                                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Verb, PartOfSpeechEnum.Adverb);
                                        break;

                                    /* 3.7. Adverb = Adjective */
                                    case PartOfSpeechEnum.Adverb:
                                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, PartOfSpeechEnum.Adjective);
                                        break;
                                }
                            }
                            #endregion
                            break;
                    }

                    // If no correspondence was found, take the first part of speech returned by the
                    // morpho-dictionary and overwrite the word's PoS-tagger output type accordingly.
                    if (wfms == null)
                    {
                        var firstPartOfSpeech = wordMorphology.WordFormMorphologies[0].PartOfSpeech;
                        word.posTaggerOutputType = ToPosTaggerOutputType(firstPartOfSpeech);
                        wfms = TryGetByPosTaggerOutputType(ref wordMorphology, firstPartOfSpeech);
                    }

                    #if DEBUG
                    // Single assignment covering every clause #2 / #3 / fallback outcome.
                    word.morphologies = wfms;
                    #endif

                    _WordMorphoAmbiguities.Add(_WordMorphoAmbiguityFactory.Create(word, i, wfms));
                    #endregion

                    #endregion
                }

                //-pre-process morpho-ambiguity (optional in DEBUG builds)-//
#if DEBUG
                if (applyMorphoAmbiguityPreProcess)
#endif
                {
                    _MorphoAmbiguityPreProcessor.Run(_WordMorphoAmbiguities);
                }

                //-resolve morpho-ambiguity-//
                _MorphoAmbiguityResolver.Resolve(_WordMorphoAmbiguities);
            }
            finally
            {
                // Always reset the shared buffer, even when analysis or resolving throws; otherwise
                // the next call would append to stale entries.
                _WordMorphoAmbiguities.Clear();
            }
        }