Beispiel #1
0
        /// <summary>
        /// Attempt to infer the part of speech of the given preterminal node, which
        /// was created during the expansion of a multi-word token.
        /// </summary>
        /// <remarks>
        /// Candidates are tried in this order: a manual override for the word in its
        /// containing phrase; a verb tag found after stripping clitic pronouns from
        /// the word; the best tag the unigram tagger holds for the word itself; and
        /// finally the tag supplied by the manual model.
        /// </remarks>
        /// <param name="t">Preterminal node; the value of its first child is the word to tag.</param>
        /// <param name="parent">
        /// Node passed together with <paramref name="t"/> to <c>GetContainingPhrase</c>.
        /// </param>
        /// <param name="unigramTagger">
        /// Two-dimensional counter whose first keys are words; the counter stored for
        /// a word is used to pick that word's tag.
        /// </param>
        /// <returns>The tag chosen by the first strategy that yields one.</returns>
        private static string InferPOS(Tree t, Tree parent, TwoDimensionalCounter <string, string> unigramTagger)
        {
            string word = t.FirstChild().Value();
            string containingPhraseStr = GetContainingPhrase(t, parent);

            // Overrides: let the manual POS model handle a few special cases first
            string overrideTag = MultiWordPreprocessor.ManualUWModel.GetOverrideTag(word, containingPhraseStr);
            if (overrideTag != null)
            {
                return overrideTag;
            }

            // Fetched once; both membership checks below reuse it.
            ICollection <string> unigramTaggerKeys = unigramTagger.FirstKeySet();

            // Try treating this word as a verb and stripping any clitic
            // pronouns. If the stripped version exists in the unigram
            // tagger, then stick with the verb hypothesis
            SpanishVerbStripper.StrippedVerb strippedVerb = verbStripper.SeparatePronouns(word);
            if (strippedVerb != null)
            {
                string stem = strippedVerb.GetStem();
                if (unigramTaggerKeys.Contains(stem))
                {
                    string pos = Counters.Argmax(unigramTagger.GetCounter(stem));
                    // Tags are machine-readable identifiers, so compare ordinally rather
                    // than with the current culture. The null guard is defensive: if no
                    // argmax is available we fall through to the other strategies.
                    if (pos != null && pos.StartsWith("v", System.StringComparison.Ordinal))
                    {
                        return pos;
                    }
                }
            }

            if (unigramTaggerKeys.Contains(word))
            {
                return Counters.Argmax(unigramTagger.GetCounter(word), new MultiWordPreprocessor.POSTieBreaker());
            }

            return MultiWordPreprocessor.ManualUWModel.GetTag(word, containingPhraseStr);
        }
Beispiel #2
0
        /// <summary>
        /// Splits a verb that carries attached pronoun suffixes (as marked by the
        /// lexer) into a stem label plus one label per pronoun.
        /// </summary>
        /// <remarks>
        /// Examples:
        /// Escribamosela =&gt; Escribamo + se + la =&gt; escribamos + se + la
        /// Sentaos =&gt; senta + os =&gt; sentad + os
        /// Damelo =&gt; da + me + lo
        /// The pronoun labels are appended to <c>compoundBuffer</c>, in the order
        /// the verb stripper reports them.
        /// </remarks>
        /// <param name="cl">
        /// Label of the verb token. Its parent annotation is removed before splitting.
        /// </param>
        /// <returns>
        /// <paramref name="cl"/> itself when the verb stripper returns no result;
        /// otherwise the label built for the stem.
        /// </returns>
        private CoreLabel ProcessVerb(CoreLabel cl)
        {
            cl.Remove(typeof(CoreAnnotations.ParentAnnotation));

            SpanishVerbStripper.StrippedVerb parts = verbStripper.SeparatePronouns(cl.Word());
            if (parts == null)
            {
                return cl;
            }

            // The stem occupies [begin, stemEnd) of the original token; each pronoun
            // starts where the previous piece ended.
            int stemEnd = cl.BeginPosition() + parts.GetOriginalStem().Length;
            int nextPronounBegin = stemEnd;

            foreach (string clitic in parts.GetPronouns())
            {
                CoreLabel cliticLabel = CopyCoreLabel(cl, clitic, nextPronounBegin);
                compoundBuffer.Add(cliticLabel);
                nextPronounBegin += clitic.Length;
            }

            // The stem label carries the normalized stem as its word but keeps the
            // stem exactly as written as its original text.
            CoreLabel stemLabel = CopyCoreLabel(cl, parts.GetStem(), cl.BeginPosition(), stemEnd);
            stemLabel.SetOriginalText(parts.GetOriginalStem());
            return stemLabel;
        }
Beispiel #3
0
        /// <summary>
        /// Determine whether the given clitic pronoun is an indirect object
        /// pronoun or a reflexive pronoun.
        /// </summary>
        /// <remarks>
        /// This method is only defined when the pronoun is one of
        /// me, te, se, nos, os
        /// i.e., those in which the meaning is actually ambiguous.
        /// </remarks>
        /// <param name="strippedVerb">
        /// Stripped verb as returned by
        /// <see cref="Edu.Stanford.Nlp.International.Spanish.SpanishVerbStripper.SeparatePronouns(string)"/>
        /// .
        /// </param>
        /// <param name="pronounIdx">
        /// The index of the pronoun within
        /// <c>strippedVerb.GetPronouns()</c>
        /// which should be
        /// disambiguated.
        /// </param>
        /// <param name="clauseYield">
        /// A string representing the yield of the
        /// clause which contains the given verb
        /// </param>
        /// <returns>
        /// <c>Reflexive</c> for a lone "se" or a verb listed in <c>alwaysReflexiveVerbs</c>;
        /// <c>Object</c> for a verb listed in <c>neverReflexiveVerbs</c>; otherwise the
        /// entry recorded in <c>bruteForceDecisions</c> for this verb and clause, or
        /// <c>Unknown</c> (after logging) when no rule applies.
        /// </returns>
        /// <exception cref="System.ArgumentException">
        /// If the given pronoun is
        /// not ambiguous, or its disambiguation is not supported.
        /// </exception>
        public static AnCoraPronounDisambiguator.PersonalPronounType DisambiguatePersonalPronoun(SpanishVerbStripper.StrippedVerb strippedVerb, int pronounIdx, string clauseYield)
        {
            IList <string> pronouns = strippedVerb.GetPronouns();
            // Lower-case with the invariant culture so the membership test below
            // does not depend on the thread's current culture.
            string pronoun = pronouns[pronounIdx].ToLowerInvariant();

            if (!ambiguousPersonalPronouns.Contains(pronoun))
            {
                throw new ArgumentException("We don't support disambiguating pronoun '" + pronoun + "'");
            }

            // A lone "se" is always treated as reflexive.
            if (pronouns.Count == 1 && Sharpen.Runtime.EqualsIgnoreCase(pronoun, "se"))
            {
                return AnCoraPronounDisambiguator.PersonalPronounType.Reflexive;
            }

            // Verb-level rules: some verbs settle the question on their own.
            string verb = strippedVerb.GetStem();
            if (alwaysReflexiveVerbs.Contains(verb))
            {
                return AnCoraPronounDisambiguator.PersonalPronounType.Reflexive;
            }
            if (neverReflexiveVerbs.Contains(verb))
            {
                return AnCoraPronounDisambiguator.PersonalPronounType.Object;
            }

            // Last resort: decisions recorded for a specific (verb, clause) pair.
            Pair <string, string> bruteForceKey = new Pair <string, string>(verb, clauseYield);
            if (bruteForceDecisions.Contains(bruteForceKey))
            {
                return bruteForceDecisions[bruteForceKey];
            }

            // Log this instance where a clitic pronoun could not be disambiguated.
            log.Info("Failed to disambiguate: " + verb + "\nContaining clause:\t" + clauseYield + "\n");
            return AnCoraPronounDisambiguator.PersonalPronounType.Unknown;
        }