/// <summary>
/// Handles contractions like del and al, marked by the lexer.
/// del => de + el
/// al => a + el
/// conmigo/consigo => con + mí/sí
/// contigo => con + ti  (no accent: "ti" never takes one in Spanish)
/// The second half of the split is queued on compoundBuffer; the first half
/// is returned, with character offsets distributed over the original span.
/// </summary>
private CoreLabel ProcessContraction(CoreLabel cl)
{
    // Clear the lexer's contraction marker before splitting.
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
    string word = cl.Word();
    string first;
    string second;
    int secondOffset = 0;
    int secondLength = 0;
    string lowered = word.ToLower();
    switch (lowered)
    {
        case "del":
        case "al":
        {
            first = Sharpen.Runtime.Substring(word, 0, lowered.Length - 1);
            char lastChar = word[lowered.Length - 1];
            // Match the case of the source text in the expanded article.
            // BUG FIX: char.IsLowerCase does not exist in the BCL; char.IsLower
            // is the correct API (static methods cannot be added by extension).
            if (char.IsLower(lastChar))
            {
                second = "el";
            }
            else
            {
                second = "EL";
            }
            secondOffset = 1;
            secondLength = lowered.Length - 1;
            break;
        }

        case "conmigo":
        case "consigo":
        {
            first = Sharpen.Runtime.Substring(word, 0, 3);
            // Keep the original pronoun consonant and add the accented í (mí / sí).
            second = word[3] + "í";
            secondOffset = 3;
            secondLength = 4;
            break;
        }

        case "contigo":
        {
            first = Sharpen.Runtime.Substring(word, 0, 3);
            // "ti" carries no accent, so the raw substring is correct here.
            second = Sharpen.Runtime.Substring(word, 3, 5);
            secondOffset = 3;
            secondLength = 4;
            break;
        }

        default:
        {
            throw new ArgumentException("Invalid contraction provided to processContraction");
        }
    }
    // Distribute the original character span across the two new tokens.
    int secondStart = cl.BeginPosition() + secondOffset;
    int secondEnd = secondStart + secondLength;
    compoundBuffer.Add(CopyCoreLabel(cl, second, secondStart, secondEnd));
    return CopyCoreLabel(cl, first, cl.BeginPosition(), secondStart);
}
/// <summary>Splits a contraction marked by the lexer.</summary>
/// <remarks>
/// Splits a contraction marked by the lexer:
/// au  => à + le
/// aux => à + les
/// du  => de + le
/// NOTE(review): the upstream comment also lists "des => de + les", but no such
/// case exists below — presumably the lexer never marks it; verify upstream.
/// The article half is queued on compoundBuffer; the preposition half is returned.
/// </remarks>
private CoreLabel ProcessContraction(CoreLabel cl)
{
    // Clear the lexer's contraction marker before splitting.
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
    string surface = cl.Word();
    string key = surface.ToLower();
    string prep;
    string article;
    int articleOffset;
    int articleLength;
    if (key == "au")
    {
        prep = "à";
        article = "le";
        articleOffset = 1;
        articleLength = 1;
    }
    else if (key == "aux")
    {
        prep = "à";
        article = "les";
        articleOffset = 1;
        articleLength = 2;
    }
    else if (key == "du")
    {
        prep = "de";
        article = "le";
        articleOffset = 1;
        articleLength = 1;
    }
    else
    {
        throw new ArgumentException("Invalid contraction provided to processContraction");
    }
    // Split the original character span between the two new tokens.
    int articleStart = cl.BeginPosition() + articleOffset;
    int articleEnd = articleStart + articleLength;
    compoundBuffer.Add(CopyCoreLabel(cl, article, articleStart, articleEnd));
    return CopyCoreLabel(cl, prep, cl.BeginPosition(), articleStart);
}
/// <summary>
/// Converts a token-index range to a character range and reports whether
/// charIndex falls inside it.
/// </summary>
/// <param name="tokenRange">pair of (start token index, end token index) into the document's tokens</param>
/// <param name="charIndex">character offset to test</param>
/// <returns>true if charIndex lies within the characters spanned by the token range</returns>
public virtual bool RangeContainsCharIndex(Pair<int, int> tokenRange, int charIndex)
{
    IList<CoreLabel> tokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    int spanBegin = tokens[tokenRange.First()].BeginPosition();
    int spanEnd = tokens[tokenRange.Second()].EndPosition();
    // NOTE(review): the end comparison is inclusive even though EndPosition is
    // conventionally exclusive — kept as-is to preserve existing behavior.
    return spanBegin <= charIndex && charIndex <= spanEnd;
}
/// <summary>Splits a compound marked by the lexer.</summary>
/// <remarks>
/// Dashes are first padded with spaces, then the word is split on whitespace,
/// so each dash becomes its own token. Every piece is queued on compoundBuffer
/// and the first piece is popped and returned. Offsets are assigned by
/// accumulating piece lengths, which reproduces the original dash positions.
/// </remarks>
private CoreLabel ProcessCompound(CoreLabel cl)
{
    // Clear the lexer's compound marker before splitting.
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
    string[] pieces = pSpace.Split(pDash.Matcher(cl.Word()).ReplaceAll(" - "));
    int offset = 0;
    foreach (string piece in pieces)
    {
        CoreLabel pieceLabel = new CoreLabel(cl);
        pieceLabel.SetWord(piece);
        pieceLabel.SetValue(piece);
        int pieceBegin = cl.BeginPosition() + offset;
        pieceLabel.SetBeginPosition(pieceBegin);
        pieceLabel.SetEndPosition(pieceBegin + piece.Length);
        pieceLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
        compoundBuffer.Add(pieceLabel);
        offset += piece.Length;
    }
    // Emit the first piece now; the rest drain from the buffer on later calls.
    return compoundBuffer.Remove(0);
}
/// <summary>
/// Handles verbs with attached suffix pronouns, marked by the lexer:
/// Escribamosela => escribamos + se + la
/// Sentaos => sentad + os
/// Damelo => da + me + lo
/// Pronoun tokens are queued on compoundBuffer; the (possibly normalized)
/// stem is returned with its original surface text preserved.
/// </summary>
private CoreLabel ProcessVerb(CoreLabel cl)
{
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
    SpanishVerbStripper.StrippedVerb stripped = verbStripper.SeparatePronouns(cl.Word());
    // Not actually a verb+pronoun form: pass the token through untouched.
    if (stripped == null)
    {
        return cl;
    }
    // Walk the attached pronouns, assigning each a span immediately after the
    // original stem, advancing by pronoun length.
    int stemEnd = cl.BeginPosition() + stripped.GetOriginalStem().Length;
    int cursor = stemEnd;
    foreach (string pronoun in stripped.GetPronouns())
    {
        compoundBuffer.Add(CopyCoreLabel(cl, pronoun, cursor));
        cursor += pronoun.Length;
    }
    // The stem may have been normalized (e.g. "senta" -> "sentad"), so keep
    // the original surface form as OriginalText.
    CoreLabel stem = CopyCoreLabel(cl, stripped.GetStem(), cl.BeginPosition(), stemEnd);
    stem.SetOriginalText(stripped.GetOriginalStem());
    return stem;
}
/// <summary>Convert token to a sequence of datums and add to iobList.</summary>
/// <param name="iobList">output list receiving one datum per character of the (possibly rewritten) token</param>
/// <param name="cl">label for this token; supplies character offsets and the rewritten-form annotation</param>
/// <param name="token">token text (segmentation markers already stripped by the caller — TODO confirm)</param>
/// <param name="tokType">segmentation-marker type, used to strip markers from the rewritten form</param>
/// <param name="tokenLabel">label carrying the raw marker-bearing word and its POS tag</param>
/// <param name="lastToken">text of the preceding token (consulted by rewrite rule #2)</param>
/// <param name="applyRewriteRules">whether to apply the Arabic-specific rewrite rules below</param>
/// <param name="stripRewrites">when true, rewrite the text but do not mark characters with the rewrite label</param>
/// <param name="tf">a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)</param>
/// <param name="origText">the original string before tokenization (for determining original segment boundaries); may be null</param>
private static void TokenToDatums(IList<CoreLabel> iobList, CoreLabel cl, string token, IOBUtils.TokenType tokType, CoreLabel tokenLabel, string lastToken, bool applyRewriteRules, bool stripRewrites, ITokenizerFactory<CoreLabel> tf, string origText)
{
    // Nothing to emit for an empty token.
    if (token.IsEmpty())
    {
        return;
    }
    string lastLabel = ContinuationSymbol;
    string firstLabel = BeginSymbol;
    // Rewritten (normalized) form recorded by the document reader, if any.
    string rewritten = cl.Get(typeof(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation));
    bool crossRefRewrites = true;
    if (rewritten == null)
    {
        // No recorded rewrite: fall back to the token itself and disable
        // character-by-character cross-referencing below.
        rewritten = token;
        crossRefRewrites = false;
    }
    else
    {
        rewritten = StripSegmentationMarkers(rewritten, tokType);
    }
    if (applyRewriteRules)
    {
        // Apply Arabic-specific re-write rules
        string rawToken = tokenLabel.Word();
        string tag = tokenLabel.Tag();
        MorphoFeatureSpecification featureSpec = new ArabicMorphoFeatureSpecification();
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Ngen);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Nnum);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Def);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Tense);
        MorphoFeatures features = featureSpec.StrToFeatures(tag);
        // Rule #1 : ت --> ة  (applies to feminine-singular forms per the Ngen/Nnum check)
        if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Ngen).Equals("F") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Nnum).Equals("SG") && rawToken.EndsWith("ت-") && !stripRewrites)
        {
            lastLabel = RewriteSymbol;
        }
        else
        {
            if (rawToken.EndsWith("ة-"))
            {
                System.Diagnostics.Debug.Assert(token.EndsWith("ة"));
                token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ت";
                lastLabel = RewriteSymbol;
            }
        }
        // Rule #2 : لل --> ل ال
        if (lastToken.Equals("ل") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Def).Equals("D"))
        {
            if (rawToken.StartsWith("-ال"))
            {
                if (!token.StartsWith("ا"))
                {
                    log.Info("Bad REWAL: " + rawToken + " / " + token);
                }
                // Drop the leading character of both forms to undo the doubled lam.
                token = Sharpen.Runtime.Substring(token, 1);
                rewritten = Sharpen.Runtime.Substring(rewritten, 1);
                if (!stripRewrites)
                {
                    firstLabel = RewriteSymbol;
                }
            }
            else
            {
                if (rawToken.StartsWith("-ل"))
                {
                    if (!token.StartsWith("ل"))
                    {
                        log.Info("Bad REWAL: " + rawToken + " / " + token);
                    }
                    if (!stripRewrites)
                    {
                        firstLabel = RewriteSymbol;
                    }
                }
                else
                {
                    log.Info("Ignoring REWAL: " + rawToken + " / " + token);
                }
            }
        }
        // Rule #3 : ي --> ى
        // Rule #4 : ا --> ى
        if (rawToken.EndsWith("ى-"))
        {
            if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Tense) != null)
            {
                // verb: ى becomes ا
                token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ا";
            }
            else
            {
                // assume preposition:
                token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ي";
            }
            if (!stripRewrites)
            {
                lastLabel = RewriteSymbol;
            }
        }
        else
        {
            if (rawToken.Equals("علي-") || rawToken.Equals("-علي-"))
            {
                if (!stripRewrites)
                {
                    lastLabel = RewriteSymbol;
                }
            }
        }
    }
    // Recover this token's span in the untokenized text so datum offsets line
    // up with original character positions.
    string origWord;
    if (origText == null)
    {
        origWord = tokenLabel.Word();
    }
    else
    {
        origWord = Sharpen.Runtime.Substring(origText, cl.BeginPosition(), cl.EndPosition());
    }
    int origIndex = 0;
    // Skip over leading characters the tokenizer deleted from the original.
    while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
    {
        ++origIndex;
    }
    // Create datums and add to iobList
    if (token.IsEmpty())
    {
        log.Info("Rewriting resulted in empty token: " + tokenLabel.Word());
    }
    string firstChar = token[0].ToString();
    // Start at 0 to make sure we include the whole token according to the tokenizer
    iobList.Add(CreateDatum(cl, firstChar, firstLabel, 0, origIndex + 1));
    int numChars = token.Length;
    if (crossRefRewrites && rewritten.Length != numChars)
    {
        // Lengths diverged; character-level cross-referencing is meaningless.
        System.Console.Error.Printf("Rewritten annotation doesn't have correct length: %s>>>%s%n", token, rewritten);
        crossRefRewrites = false;
    }
    ++origIndex;
    for (int j = 1; j < numChars; ++j, ++origIndex)
    {
        // Again skip characters deleted by the tokenizer, clamping to the last
        // original character if we run off the end.
        while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
        {
            ++origIndex;
        }
        if (origIndex >= origWord.Length)
        {
            origIndex = origWord.Length - 1;
        }
        string charLabel = (j == numChars - 1) ? lastLabel : ContinuationSymbol;
        string thisChar = token[j].ToString();
        // Mark any character that differs from the recorded rewritten form.
        if (crossRefRewrites && !rewritten[j].ToString().Equals(thisChar))
        {
            charLabel = RewriteSymbol;
        }
        // Assume all mid-word alef maqsura are supposed to be yah
        if (charLabel == ContinuationSymbol && thisChar.Equals("ى") && j != numChars - 1)
        {
            charLabel = RewriteSymbol;
        }
        iobList.Add(CreateDatum(cl, thisChar, charLabel, origIndex, origIndex + 1));
    }
    // End at endPosition to make sure we include the whole token according to the tokenizer
    if (!iobList.IsEmpty())
    {
        iobList[iobList.Count - 1].SetEndPosition(cl.EndPosition());
    }
}