/// <summary>
/// Converts a token range to a character range and reports whether
/// <paramref name="charIndex"/> falls inside it.
/// </summary>
/// <param name="tokenRange">
/// Pair of token indices; the first selects the token whose begin position opens the
/// character range and the second selects the token whose end position closes it.
/// </param>
/// <param name="charIndex">Character offset to test.</param>
/// <returns>
/// <c>true</c> when <paramref name="charIndex"/> lies between the two character
/// positions, both bounds included; otherwise <c>false</c>.
/// </returns>
public virtual bool RangeContainsCharIndex(Pair<int, int> tokenRange, int charIndex)
{
    IList<CoreLabel> docTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));

    // Look up both boundary tokens before reading any character offsets.
    CoreLabel firstInRange = docTokens[tokenRange.First()];
    CoreLabel lastInRange = docTokens[tokenRange.Second()];

    int lowerBound = firstInRange.BeginPosition();
    int upperBound = lastInRange.EndPosition();

    if (charIndex < lowerBound)
    {
        return false;
    }
    return charIndex <= upperBound;
}
/// <summary>
/// Convert token to a sequence of per-character datums and add them to
/// <paramref name="iobList"/>.
/// </summary>
/// <remarks>
/// Nothing is added when <paramref name="token"/> is empty on entry, or when the
/// rewrite rules reduce it to an empty string (that case is logged).
/// </remarks>
/// <param name="iobList">List that receives the created datums.</param>
/// <param name="cl">
/// Label passed to <c>CreateDatum</c>; its RewrittenArabicAnnotation and its
/// begin/end positions are read here.
/// </param>
/// <param name="token">The token text to split into characters.</param>
/// <param name="tokType">Token type, forwarded to <c>StripSegmentationMarkers</c>.</param>
/// <param name="tokenLabel">Label supplying the raw word and the tag used by the rewrite rules.</param>
/// <param name="lastToken">The previous token; Rule #2 applies only when it equals "ل".</param>
/// <param name="applyRewriteRules">Whether to apply the Arabic-specific rewrite rules.</param>
/// <param name="stripRewrites">
/// When true, most rules leave the first/last label unchanged instead of setting
/// <c>RewriteSymbol</c> (the "ة-" branch of Rule #1 still sets it).
/// </param>
/// <param name="tf">a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)</param>
/// <param name="origText">the original string before tokenization (for determining original segment boundaries)</param>
private static void TokenToDatums(IList<CoreLabel> iobList, CoreLabel cl, string token, IOBUtils.TokenType tokType, CoreLabel tokenLabel, string lastToken, bool applyRewriteRules, bool stripRewrites, ITokenizerFactory<CoreLabel> tf, string origText)
{
    if (token.IsEmpty())
    {
        return;
    }
    string lastLabel = ContinuationSymbol;
    string firstLabel = BeginSymbol;
    string rewritten = cl.Get(typeof(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation));
    bool crossRefRewrites = true;
    if (rewritten == null)
    {
        rewritten = token;
        crossRefRewrites = false;
    }
    else
    {
        rewritten = StripSegmentationMarkers(rewritten, tokType);
    }
    if (applyRewriteRules)
    {
        // Apply Arabic-specific re-write rules.
        // All prefix/suffix tests are ordinal: the default string overloads of
        // StartsWith/EndsWith are culture-sensitive in .NET.
        string rawToken = tokenLabel.Word();
        string tag = tokenLabel.Tag();
        MorphoFeatureSpecification featureSpec = new ArabicMorphoFeatureSpecification();
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Ngen);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Nnum);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Def);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Tense);
        MorphoFeatures features = featureSpec.StrToFeatures(tag);
        // Rule #1 : ت --> ة
        if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Ngen).Equals("F") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Nnum).Equals("SG") && rawToken.EndsWith("ت-", System.StringComparison.Ordinal) && !stripRewrites)
        {
            lastLabel = RewriteSymbol;
        }
        else
        {
            if (rawToken.EndsWith("ة-", System.StringComparison.Ordinal))
            {
                System.Diagnostics.Debug.Assert(token.EndsWith("ة", System.StringComparison.Ordinal));
                token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ت";
                lastLabel = RewriteSymbol;
            }
        }
        // Rule #2 : لل --> ل ال
        if (lastToken.Equals("ل") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Def).Equals("D"))
        {
            if (rawToken.StartsWith("-ال", System.StringComparison.Ordinal))
            {
                if (!token.StartsWith("ا", System.StringComparison.Ordinal))
                {
                    log.Info("Bad REWAL: " + rawToken + " / " + token);
                }
                // Drop the leading character from both the token and its rewritten form.
                token = Sharpen.Runtime.Substring(token, 1);
                rewritten = Sharpen.Runtime.Substring(rewritten, 1);
                if (!stripRewrites)
                {
                    firstLabel = RewriteSymbol;
                }
            }
            else
            {
                if (rawToken.StartsWith("-ل", System.StringComparison.Ordinal))
                {
                    if (!token.StartsWith("ل", System.StringComparison.Ordinal))
                    {
                        log.Info("Bad REWAL: " + rawToken + " / " + token);
                    }
                    if (!stripRewrites)
                    {
                        firstLabel = RewriteSymbol;
                    }
                }
                else
                {
                    log.Info("Ignoring REWAL: " + rawToken + " / " + token);
                }
            }
        }
        // Rule #3 : ي --> ى
        // Rule #4 : ا --> ى
        if (rawToken.EndsWith("ى-", System.StringComparison.Ordinal))
        {
            if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Tense) != null)
            {
                // verb: ى becomes ا
                token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ا";
            }
            else
            {
                // assume preposition:
                token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ي";
            }
            if (!stripRewrites)
            {
                lastLabel = RewriteSymbol;
            }
        }
        else
        {
            if (rawToken.Equals("علي-") || rawToken.Equals("-علي-"))
            {
                if (!stripRewrites)
                {
                    lastLabel = RewriteSymbol;
                }
            }
        }
    }
    string origWord;
    if (origText == null)
    {
        origWord = tokenLabel.Word();
    }
    else
    {
        origWord = Sharpen.Runtime.Substring(origText, cl.BeginPosition(), cl.EndPosition());
    }
    // Skip leading characters of the original word that IsDeletedCharacter reports as deleted.
    int origIndex = 0;
    while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
    {
        ++origIndex;
    }
    // Create datums and add to iobList
    if (token.IsEmpty())
    {
        // Rule #2 can strip the only character of the token. There is nothing to
        // emit, and indexing token[0] below would throw, so stop here.
        log.Info("Rewriting resulted in empty token: " + tokenLabel.Word());
        return;
    }
    string firstChar = token[0].ToString();
    // Start at 0 to make sure we include the whole token according to the tokenizer
    iobList.Add(CreateDatum(cl, firstChar, firstLabel, 0, origIndex + 1));
    int numChars = token.Length;
    if (crossRefRewrites && rewritten.Length != numChars)
    {
        // NOTE(review): Printf with Java-style format specifiers presumably relies on a
        // Sharpen compatibility extension - confirm it is available for TextWriter.
        System.Console.Error.Printf("Rewritten annotation doesn't have correct length: %s>>>%s%n", token, rewritten);
        crossRefRewrites = false;
    }
    ++origIndex;
    for (int j = 1; j < numChars; ++j, ++origIndex)
    {
        while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
        {
            ++origIndex;
        }
        // Clamp so the datum's position never runs past the original word.
        if (origIndex >= origWord.Length)
        {
            origIndex = origWord.Length - 1;
        }
        string charLabel = (j == numChars - 1) ? lastLabel : ContinuationSymbol;
        string thisChar = token[j].ToString();
        if (crossRefRewrites && !rewritten[j].ToString().Equals(thisChar))
        {
            charLabel = RewriteSymbol;
        }
        // Assume all mid-word alef maqsura are supposed to be yah
        if (charLabel == ContinuationSymbol && thisChar.Equals("ى") && j != numChars - 1)
        {
            charLabel = RewriteSymbol;
        }
        iobList.Add(CreateDatum(cl, thisChar, charLabel, origIndex, origIndex + 1));
    }
    // End at endPosition to make sure we include the whole token according to the tokenizer
    if (!iobList.IsEmpty())
    {
        iobList[iobList.Count - 1].SetEndPosition(cl.EndPosition());
    }
}