Пример #1
0
        /// <summary>
        /// Splits a contraction marked by the lexer into two labels.
        /// </summary>
        /// <remarks>
        /// del =&gt; de + l =&gt; de + el
        /// al =&gt; a + l =&gt; a + el
        /// con[mts]igo =&gt; con + [mts]i
        /// The label for the second part is appended to <c>compoundBuffer</c>; the label
        /// for the first part is returned. The ParentAnnotation is removed from
        /// <paramref name="cl"/> before it is copied.
        /// </remarks>
        /// <param name="cl">Label whose word is one of the contractions above, in any casing.</param>
        /// <returns>A copy of <paramref name="cl"/> carrying the first part of the contraction.</returns>
        /// <exception cref="ArgumentException">The word is not one of the handled contractions.</exception>
        private CoreLabel ProcessContraction(CoreLabel cl)
        {
            cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
            string word = cl.Word();
            string first;
            string second;
            int    secondOffset;
            int    secondLength;

            // The case labels are fixed ASCII strings, so lower-case with the invariant
            // culture: a culture-sensitive ToLower() (e.g. Turkish "I" -> dotless "ı")
            // would make "CONMIGO" miss its case and end up in the default branch.
            string lowered = word.ToLowerInvariant();

            switch (lowered)
            {
            case "del":
            case "al":
            {
                // Everything but the trailing "l" is the first part; the article is
                // emitted in the same case as that trailing letter.
                first = Sharpen.Runtime.Substring(word, 0, lowered.Length - 1);
                char lastChar = word[lowered.Length - 1];
                second       = char.IsLower(lastChar) ? "el" : "EL";
                secondOffset = 1;
                secondLength = lowered.Length - 1;
                break;
            }

            case "conmigo":
            case "consigo":
            {
                // Keep the original casing of "con" and of the pronoun's consonant.
                first        = Sharpen.Runtime.Substring(word, 0, 3);
                second       = word[3] + "í";
                secondOffset = 3;
                secondLength = 4;
                break;
            }

            case "contigo":
            {
                first        = Sharpen.Runtime.Substring(word, 0, 3);
                second       = Sharpen.Runtime.Substring(word, 3, 5);
                secondOffset = 3;
                secondLength = 4;
                break;
            }

            default:
            {
                throw new ArgumentException("Invalid contraction provided to processContraction");
            }
            }

            // Offsets are relative to the begin position of the original label.
            int secondStart = cl.BeginPosition() + secondOffset;
            int secondEnd   = secondStart + secondLength;

            compoundBuffer.Add(CopyCoreLabel(cl, second, secondStart, secondEnd));
            return CopyCoreLabel(cl, first, cl.BeginPosition(), secondStart);
        }
Пример #2
0
        /// <summary>Splits a contraction marked by the lexer.</summary>
        /// <remarks>
        /// Handled contractions (matched case-insensitively):
        /// au =&gt; a + u =&gt; à + le
        /// aux =&gt; a + ux =&gt; à + les
        /// du =&gt; d + u =&gt; de + le
        /// Any other word, including "des", is rejected with an exception.
        /// The label for the second part is appended to <c>compoundBuffer</c>; the label
        /// for the first part is returned. The ParentAnnotation is removed from
        /// <paramref name="cl"/> before it is copied.
        /// </remarks>
        /// <param name="cl">Label whose word is one of the handled contractions.</param>
        /// <returns>A copy of <paramref name="cl"/> carrying the first part of the contraction.</returns>
        /// <exception cref="ArgumentException">The word is not one of the handled contractions.</exception>
        private CoreLabel ProcessContraction(CoreLabel cl)
        {
            cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
            string word = cl.Word();
            string first;
            string second;
            int    secondOffset;
            int    secondLength;

            // The case labels are fixed ASCII strings, so the match must not depend on
            // the current culture's casing rules.
            string lowered = word.ToLowerInvariant();

            switch (lowered)
            {
            case "au":
            {
                first        = "à";
                second       = "le";
                secondOffset = 1;
                secondLength = 1;
                break;
            }

            case "aux":
            {
                first        = "à";
                second       = "les";
                secondOffset = 1;
                secondLength = 2;
                break;
            }

            case "du":
            {
                first        = "de";
                second       = "le";
                secondOffset = 1;
                secondLength = 1;
                break;
            }

            default:
            {
                throw new ArgumentException("Invalid contraction provided to processContraction");
            }
            }

            // Offsets are relative to the begin position of the original label.
            int secondStart = cl.BeginPosition() + secondOffset;
            int secondEnd   = secondStart + secondLength;

            compoundBuffer.Add(CopyCoreLabel(cl, second, secondStart, secondEnd));
            return CopyCoreLabel(cl, first, cl.BeginPosition(), secondStart);
        }
Пример #3
0
        /// <summary>
        /// Converts a token range to a character range and tests whether a character
        /// index falls inside it.
        /// </summary>
        /// <param name="tokenRange">
        /// Pair whose first and second values index the first and last token of the
        /// range in the document's token list.
        /// </param>
        /// <param name="charIndex">Character index to test.</param>
        /// <returns>
        /// <c>true</c> when <paramref name="charIndex"/> is no smaller than the begin
        /// position of the first token and no larger than the end position of the last
        /// token; otherwise <c>false</c>.
        /// </returns>
        public virtual bool RangeContainsCharIndex(Pair <int, int> tokenRange, int charIndex)
        {
            IList <CoreLabel> docTokens  = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            CoreLabel         firstToken = docTokens[tokenRange.First()];
            CoreLabel         lastToken  = docTokens[tokenRange.Second()];

            int rangeCharBegin = firstToken.BeginPosition();
            int rangeCharEnd   = lastToken.EndPosition();

            // Both bounds are inclusive.
            if (charIndex < rangeCharBegin)
            {
                return false;
            }
            return charIndex <= rangeCharEnd;
        }
Пример #4
0
        /// <summary>
        /// Breaks a compound marked by the lexer into one label per piece.
        /// </summary>
        /// <remarks>
        /// Every match of <c>pDash</c> in the word is replaced by " - " and the result is
        /// split with <c>pSpace</c>. Each piece becomes a copy of <paramref name="cl"/>
        /// with its word, value, original text and character positions set for that
        /// piece, and is appended to <c>compoundBuffer</c>. Positions are laid out back
        /// to back starting at the begin position of <paramref name="cl"/>.
        /// </remarks>
        /// <param name="cl">Label holding the compound; its ParentAnnotation is removed.</param>
        /// <returns>
        /// The result of <c>compoundBuffer.Remove(0)</c> (presumably the label at the
        /// head of the buffer; confirm against the buffer's type).
        /// </returns>
        private CoreLabel ProcessCompound(CoreLabel cl)
        {
            cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
            string   dashSeparated = pDash.Matcher(cl.Word()).ReplaceAll(" - ");
            string[] pieces        = pSpace.Split(dashSeparated);
            int      consumed      = 0;

            for (int i = 0; i < pieces.Length; ++i)
            {
                string    piece      = pieces[i];
                CoreLabel pieceLabel = new CoreLabel(cl);
                int       pieceBegin = cl.BeginPosition() + consumed;
                int       pieceEnd   = pieceBegin + piece.Length;

                pieceLabel.SetWord(piece);
                pieceLabel.SetValue(piece);
                pieceLabel.SetBeginPosition(pieceBegin);
                pieceLabel.SetEndPosition(pieceEnd);
                pieceLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
                compoundBuffer.Add(pieceLabel);

                consumed += piece.Length;
            }
            return compoundBuffer.Remove(0);
        }
Пример #5
0
        /// <summary>
        /// Separates attached pronouns from a verb marked by the lexer.
        /// </summary>
        /// <remarks>
        /// Escribamosela =&gt; Escribamo + se + la =&gt; escribamos + se + la
        /// Sentaos =&gt; senta + os =&gt; sentad + os
        /// Damelo =&gt; da + me + lo
        /// One label per pronoun is appended to <c>compoundBuffer</c>, positioned one
        /// after another starting where the original stem ends.
        /// </remarks>
        /// <param name="cl">Label holding the verb; its ParentAnnotation is removed.</param>
        /// <returns>
        /// <paramref name="cl"/> itself when the verb stripper returns <c>null</c>;
        /// otherwise a copy carrying the stem, with the original stem as its original text.
        /// </returns>
        private CoreLabel ProcessVerb(CoreLabel cl)
        {
            cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
            SpanishVerbStripper.StrippedVerb parts = verbStripper.SeparatePronouns(cl.Word());
            if (parts == null)
            {
                // Nothing to separate: hand the label back as it is.
                return cl;
            }

            // The stem span is measured on the original (unmodified) stem text.
            int stemEndPos       = cl.BeginPosition() + parts.GetOriginalStem().Length;
            int nextPronounBegin = stemEndPos;

            foreach (string attached in parts.GetPronouns())
            {
                compoundBuffer.Add(CopyCoreLabel(cl, attached, nextPronounBegin));
                nextPronounBegin += attached.Length;
            }

            CoreLabel stemLabel = CopyCoreLabel(cl, parts.GetStem(), cl.BeginPosition(), stemEndPos);
            stemLabel.SetOriginalText(parts.GetOriginalStem());
            return stemLabel;
        }
Пример #6
0
        /// <summary>Convert token to a sequence of datums and add to iobList.</summary>
        /// <remarks>
        /// One datum is created per character of <paramref name="token"/> (after any
        /// rewrite rules have modified it). The first character gets the begin label,
        /// the last character gets the "last" label, and the characters in between get
        /// the continuation label; any of these may be replaced by the rewrite label.
        /// Does nothing when <paramref name="token"/> is empty on entry.
        /// </remarks>
        /// <param name="iobList">list the datums are appended to; the end position of its last element is set to the end position of <paramref name="cl"/></param>
        /// <param name="cl">label supplying the RewrittenArabicAnnotation and the begin/end positions; passed on to CreateDatum</param>
        /// <param name="token">the token text to convert</param>
        /// <param name="tokType">token type passed to StripSegmentationMarkers when a rewritten annotation is present</param>
        /// <param name="tokenLabel">label supplying the raw word and tag used by the rewrite rules, and the original word when <paramref name="origText"/> is null</param>
        /// <param name="lastToken">compared against "ل" by rewrite rule #2</param>
        /// <param name="applyRewriteRules">whether the Arabic-specific rewrite rules are applied</param>
        /// <param name="stripRewrites">when true, most rules leave the labels alone instead of assigning the rewrite label (the "ة-" branch of rule #1 assigns it regardless)</param>
        /// <param name="tf">a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)</param>
        /// <param name="origText">the original string before tokenization (for determining original segment boundaries); may be null</param>
        private static void TokenToDatums(IList <CoreLabel> iobList, CoreLabel cl, string token, IOBUtils.TokenType tokType, CoreLabel tokenLabel, string lastToken, bool applyRewriteRules, bool stripRewrites, ITokenizerFactory <CoreLabel> tf, string origText
                                          )
        {
            if (token.IsEmpty())
            {
                return;
            }
            string lastLabel        = ContinuationSymbol;
            string firstLabel       = BeginSymbol;
            string rewritten        = cl.Get(typeof(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation));
            bool   crossRefRewrites = true;

            // Without a rewritten annotation there is nothing to cross-reference
            // against, so fall back to the token itself and disable the comparison.
            if (rewritten == null)
            {
                rewritten        = token;
                crossRefRewrites = false;
            }
            else
            {
                rewritten = StripSegmentationMarkers(rewritten, tokType);
            }
            if (applyRewriteRules)
            {
                // Apply Arabic-specific re-write rules
                string rawToken = tokenLabel.Word();
                string tag      = tokenLabel.Tag();
                MorphoFeatureSpecification featureSpec = new ArabicMorphoFeatureSpecification();
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Ngen);
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Nnum);
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Def);
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Tense);
                MorphoFeatures features = featureSpec.StrToFeatures(tag);
                // Rule #1 : ت --> ة
                // Note that stripRewrites is part of the first condition, so with
                // stripRewrites set a "ت-" token falls through to the else branch.
                if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Ngen).Equals("F") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Nnum).Equals("SG") && rawToken.EndsWith("ت-") && !stripRewrites)
                {
                    lastLabel = RewriteSymbol;
                }
                else
                {
                    if (rawToken.EndsWith("ة-"))
                    {
                        // Replace the final character of the token and mark it as a rewrite.
                        System.Diagnostics.Debug.Assert(token.EndsWith("ة"));
                        token     = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ت";
                        lastLabel = RewriteSymbol;
                    }
                }
                // Rule #2 : لل --> ل ال
                if (lastToken.Equals("ل") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Def).Equals("D"))
                {
                    if (rawToken.StartsWith("-ال"))
                    {
                        if (!token.StartsWith("ا"))
                        {
                            log.Info("Bad REWAL: " + rawToken + " / " + token);
                        }
                        // Drop the first character from both the token and its rewritten form.
                        token     = Sharpen.Runtime.Substring(token, 1);
                        rewritten = Sharpen.Runtime.Substring(rewritten, 1);
                        if (!stripRewrites)
                        {
                            firstLabel = RewriteSymbol;
                        }
                    }
                    else
                    {
                        if (rawToken.StartsWith("-ل"))
                        {
                            if (!token.StartsWith("ل"))
                            {
                                log.Info("Bad REWAL: " + rawToken + " / " + token);
                            }
                            if (!stripRewrites)
                            {
                                firstLabel = RewriteSymbol;
                            }
                        }
                        else
                        {
                            log.Info("Ignoring REWAL: " + rawToken + " / " + token);
                        }
                    }
                }
                // Rule #3 : ي --> ى
                // Rule #4 : ا --> ى
                if (rawToken.EndsWith("ى-"))
                {
                    // A non-null Tense feature is taken to mean the token is a verb.
                    if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Tense) != null)
                    {
                        // verb: ى becomes ا
                        token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ا";
                    }
                    else
                    {
                        // assume preposition:
                        token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ي";
                    }
                    if (!stripRewrites)
                    {
                        lastLabel = RewriteSymbol;
                    }
                }
                else
                {
                    if (rawToken.Equals("علي-") || rawToken.Equals("-علي-"))
                    {
                        if (!stripRewrites)
                        {
                            lastLabel = RewriteSymbol;
                        }
                    }
                }
            }
            // Text used to map token characters back to original character offsets.
            string origWord;

            if (origText == null)
            {
                origWord = tokenLabel.Word();
            }
            else
            {
                origWord = Sharpen.Runtime.Substring(origText, cl.BeginPosition(), cl.EndPosition());
            }
            int origIndex = 0;

            // Skip leading characters of the original word that IsDeletedCharacter reports as deleted.
            while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
            {
                ++origIndex;
            }
            // Create datums and add to iobList
            // NOTE(review): an empty token is only logged here; execution continues to
            // token[0] on the next statement, which indexes into the empty string.
            // Confirm whether an early return was intended.
            if (token.IsEmpty())
            {
                log.Info("Rewriting resulted in empty token: " + tokenLabel.Word());
            }
            string firstChar = token[0].ToString();

            // Start at 0 to make sure we include the whole token according to the tokenizer
            iobList.Add(CreateDatum(cl, firstChar, firstLabel, 0, origIndex + 1));
            int numChars = token.Length;

            // The per-character comparison below indexes rewritten by token position,
            // so it is only usable when both strings have the same length.
            if (crossRefRewrites && rewritten.Length != numChars)
            {
                System.Console.Error.Printf("Rewritten annotation doesn't have correct length: %s>>>%s%n", token, rewritten);
                crossRefRewrites = false;
            }
            ++origIndex;
            for (int j = 1; j < numChars; ++j, ++origIndex)
            {
                // Skip characters of the original word that IsDeletedCharacter reports as deleted.
                while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
                {
                    ++origIndex;
                }
                // Clamp to the last character when the original word is exhausted.
                if (origIndex >= origWord.Length)
                {
                    origIndex = origWord.Length - 1;
                }
                string charLabel = (j == numChars - 1) ? lastLabel : ContinuationSymbol;
                string thisChar  = token[j].ToString();
                // A character that differs from the rewritten form is labelled as a rewrite.
                if (crossRefRewrites && !rewritten[j].ToString().Equals(thisChar))
                {
                    charLabel = RewriteSymbol;
                }
                // Assume all mid-word alef maqsura are supposed to be yah
                if (charLabel == ContinuationSymbol && thisChar.Equals("ى") && j != numChars - 1)
                {
                    charLabel = RewriteSymbol;
                }
                iobList.Add(CreateDatum(cl, thisChar, charLabel, origIndex, origIndex + 1));
            }
            // End at endPosition to make sure we include the whole token according to the tokenizer
            if (!iobList.IsEmpty())
            {
                iobList[iobList.Count - 1].SetEndPosition(cl.EndPosition());
            }
        }