/// <summary>Deterministically classify a token by where the segmentation marker appears in it.</summary>
/// <param name="token">the raw token text, possibly carrying a leading and/or trailing marker</param>
/// <param name="segMarker">the segmentation marker; may be null or empty</param>
/// <returns>
/// <c>NoMarker</c> when there is no usable marker or the token is exactly the marker;
/// otherwise <c>BothMarker</c>, <c>BeginMarker</c> or <c>EndMarker</c> according to whether
/// the token starts and/or ends with the marker, and <c>NoMarker</c> if it does neither.
/// </returns>
private static IOBUtils.TokenType GetTokenType(string token, string segMarker)
{
    // An empty marker would trivially "match" both ends of every token, so it is
    // treated the same as a missing marker. A token that is only the marker is
    // not a marked token either.
    if (string.IsNullOrEmpty(segMarker) || token.Equals(segMarker))
    {
        return IOBUtils.TokenType.NoMarker;
    }

    // Ordinal comparison: the marker is a literal character sequence, and the
    // single-argument StartsWith/EndsWith overloads are culture-sensitive, which
    // can skip over ignorable characters and report a match at the wrong offset.
    bool startsWithMarker = token.StartsWith(segMarker, System.StringComparison.Ordinal);
    bool endsWithMarker = token.EndsWith(segMarker, System.StringComparison.Ordinal);

    if (startsWithMarker && endsWithMarker)
    {
        return IOBUtils.TokenType.BothMarker;
    }
    if (startsWithMarker)
    {
        return IOBUtils.TokenType.BeginMarker;
    }
    if (endsWithMarker)
    {
        return IOBUtils.TokenType.EndMarker;
    }
    return IOBUtils.TokenType.NoMarker;
}
/// <summary>
/// Remove the one-character segmentation marker from the start and/or end of a token,
/// as indicated by its token type.
/// </summary>
/// <param name="tok">the token text</param>
/// <param name="tokType">which ends of <paramref name="tok"/> carry a marker</param>
/// <returns>
/// <paramref name="tok"/> itself for <c>NoMarker</c>; otherwise the substring with the
/// first character dropped (begin/both) and/or the last character dropped (end/both).
/// </returns>
private static string StripSegmentationMarkers(string tok, IOBUtils.TokenType tokType)
{
    // The length is read up front, before the NoMarker shortcut below.
    int length = tok.Length;

    bool markedBothEnds = tokType == IOBUtils.TokenType.BothMarker;
    bool trimFront = markedBothEnds || tokType == IOBUtils.TokenType.BeginMarker;
    bool trimBack = markedBothEnds || tokType == IOBUtils.TokenType.EndMarker;

    if (tokType == IOBUtils.TokenType.NoMarker)
    {
        return tok;
    }

    int first = trimFront ? 1 : 0;
    int last = trimBack ? length - 1 : length;
    return Sharpen.Runtime.Substring(tok, first, last);
}
/// <summary>
/// Convert a String to a list of characters suitable for labeling in an IOB
/// segmentation model.
/// </summary>
/// <param name="tokenList">
/// the tokens to convert; each token's word may carry <paramref name="segMarker"/>
/// at its start and/or end
/// </param>
/// <param name="segMarker">the segmentation marker character</param>
/// <param name="applyRewriteRules">add rewrite labels (for training data)</param>
/// <param name="stripRewrites">
/// revert training data to old Green and DeNero model (remove
/// rewrite labels but still rewrite to try to preserve raw text)
/// </param>
/// <param name="tf">a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)</param>
/// <param name="origText">the original string before tokenization (for determining original segment boundaries)</param>
/// <returns>the datums created for all tokens, with a boundary datum inserted between words</returns>
public static IList<CoreLabel> StringToIOB(IList<CoreLabel> tokenList, char segMarker, bool applyRewriteRules, bool stripRewrites, ITokenizerFactory<CoreLabel> tf, string origText)
{
    // Pre-size the output list (presumably a rough characters-per-token estimate).
    IList<CoreLabel> iobList = new List<CoreLabel>(tokenList.Count * 7 + tokenList.Count);
    string strSegMarker = segMarker.ToString();
    // True when the previous token ended a word, so a boundary datum must
    // be emitted before the next token.
    bool addWhitespace = false;
    string lastToken = string.Empty;
    // Marker-stripped text of the tokens seen since the last boundary.
    string currentWord = string.Empty;
    int wordStartIndex = 0;
    foreach (CoreLabel cl in tokenList)
    {
        if (addWhitespace)
        {
            // Close the word that just ended before starting a new one.
            FillInWordStatistics(iobList, currentWord, wordStartIndex);
            currentWord = string.Empty;
            // +1: the boundary datum added on the next line takes the slot at
            // iobList.Count, so the new word starts one past it.
            wordStartIndex = iobList.Count + 1;
            iobList.Add(CreateDatum(cl, BoundaryChar, BoundarySymbol));
            CoreLabel boundaryDatum = iobList[iobList.Count - 1];
            boundaryDatum.SetIndex(0);
            boundaryDatum.SetWord(string.Empty);
            addWhitespace = false;
        }

        // What type of token is this
        string token = cl.Word();
        IOBUtils.TokenType tokType = GetTokenType(token, strSegMarker);
        token = StripSegmentationMarkers(token, tokType);
        System.Diagnostics.Debug.Assert(token.Length != 0);

        if (ShouldNotSegment(token))
        {
            // The whole token becomes a single datum and always ends the word.
            iobList.Add(CreateDatum(cl, token, NosegSymbol));
            addWhitespace = true;
        }
        else
        {
            // Iterate over the characters in the token
            TokenToDatums(iobList, cl, token, tokType, cl, lastToken, applyRewriteRules, stripRewrites, tf, origText);
            // A token without a trailing marker ends the current word.
            addWhitespace = (tokType == IOBUtils.TokenType.BeginMarker || tokType == IOBUtils.TokenType.NoMarker);
        }
        currentWord += token;
        lastToken = token;
    }
    // Close the final word.
    FillInWordStatistics(iobList, currentWord, wordStartIndex);
    return iobList;
}
/// <summary>Convert token to a sequence of datums and add to iobList.</summary>
/// <param name="iobList">list that receives one datum per character of the (possibly rewritten) token</param>
/// <param name="cl">
/// label used to create every datum; also supplies the rewritten-Arabic annotation
/// and the begin/end positions of the token
/// </param>
/// <param name="token">the token text to convert; nothing is added when it is empty</param>
/// <param name="tokType">marker type of the token; used to strip markers from the rewritten annotation</param>
/// <param name="tokenLabel">label supplying the raw word and tag consulted by the rewrite rules</param>
/// <param name="lastToken">the previous token, consulted by rewrite rule #2</param>
/// <param name="applyRewriteRules">apply the Arabic-specific rewrite rules to the token</param>
/// <param name="stripRewrites">
/// when true, the rewrite labels guarded by this flag are not emitted; the
/// text rewrites themselves are still applied
/// </param>
/// <param name="tf">a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)</param>
/// <param name="origText">the original string before tokenization (for determining original segment boundaries)</param>
private static void TokenToDatums(IList<CoreLabel> iobList, CoreLabel cl, string token, IOBUtils.TokenType tokType, CoreLabel tokenLabel, string lastToken, bool applyRewriteRules, bool stripRewrites, ITokenizerFactory<CoreLabel> tf, string origText)
{
    if (token.Length == 0)
    {
        return;
    }
    string lastLabel = ContinuationSymbol;
    string firstLabel = BeginSymbol;
    string rewritten = cl.Get(typeof(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation));
    // Only cross-reference characters against the annotation when one exists.
    bool crossRefRewrites = true;
    if (rewritten == null)
    {
        rewritten = token;
        crossRefRewrites = false;
    }
    else
    {
        rewritten = StripSegmentationMarkers(rewritten, tokType);
    }

    if (applyRewriteRules)
    {
        // Apply Arabic-specific re-write rules
        // All literal tests below are ordinal: the single-argument StartsWith/EndsWith
        // overloads are culture-sensitive and may ignore characters.
        string rawToken = tokenLabel.Word();
        string tag = tokenLabel.Tag();
        MorphoFeatureSpecification featureSpec = new ArabicMorphoFeatureSpecification();
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Ngen);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Nnum);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Def);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Tense);
        MorphoFeatures features = featureSpec.StrToFeatures(tag);

        // Rule #1 : ت --> ة
        if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Ngen).Equals("F")
            && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Nnum).Equals("SG")
            && rawToken.EndsWith("ت-", System.StringComparison.Ordinal)
            && !stripRewrites)
        {
            lastLabel = RewriteSymbol;
        }
        else if (rawToken.EndsWith("ة-", System.StringComparison.Ordinal))
        {
            System.Diagnostics.Debug.Assert(token.EndsWith("ة", System.StringComparison.Ordinal));
            token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ت";
            // This branch labels the rewrite regardless of stripRewrites.
            lastLabel = RewriteSymbol;
        }

        // Rule #2 : لل --> ل ال
        if (lastToken.Equals("ل") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Def).Equals("D"))
        {
            if (rawToken.StartsWith("-ال", System.StringComparison.Ordinal))
            {
                if (!token.StartsWith("ا", System.StringComparison.Ordinal))
                {
                    // Logged only; the first character is still dropped below.
                    log.Info("Bad REWAL: " + rawToken + " / " + token);
                }
                token = Sharpen.Runtime.Substring(token, 1);
                rewritten = Sharpen.Runtime.Substring(rewritten, 1);
                if (!stripRewrites)
                {
                    firstLabel = RewriteSymbol;
                }
            }
            else if (rawToken.StartsWith("-ل", System.StringComparison.Ordinal))
            {
                if (!token.StartsWith("ل", System.StringComparison.Ordinal))
                {
                    log.Info("Bad REWAL: " + rawToken + " / " + token);
                }
                if (!stripRewrites)
                {
                    firstLabel = RewriteSymbol;
                }
            }
            else
            {
                log.Info("Ignoring REWAL: " + rawToken + " / " + token);
            }
        }

        // Rule #3 : ي --> ى
        // Rule #4 : ا --> ى
        if (rawToken.EndsWith("ى-", System.StringComparison.Ordinal))
        {
            if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Tense) != null)
            {
                // verb: ى becomes ا
                token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ا";
            }
            else
            {
                // assume preposition:
                token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ي";
            }
            if (!stripRewrites)
            {
                lastLabel = RewriteSymbol;
            }
        }
        else if (rawToken.Equals("علي-") || rawToken.Equals("-علي-"))
        {
            if (!stripRewrites)
            {
                lastLabel = RewriteSymbol;
            }
        }
    }

    // Text of this token in the original input, used to map each emitted
    // character back to an offset.
    string origWord;
    if (origText == null)
    {
        origWord = tokenLabel.Word();
    }
    else
    {
        origWord = Sharpen.Runtime.Substring(origText, cl.BeginPosition(), cl.EndPosition());
    }
    // Skip leading characters that IsDeletedCharacter reports for this tokenizer factory.
    int origIndex = 0;
    while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
    {
        ++origIndex;
    }

    // Create datums and add to iobList
    if (token.Length == 0)
    {
        // Rule #2 can drop the only character of a token. There is nothing to emit,
        // and indexing token[0] below would throw, so stop here.
        log.Info("Rewriting resulted in empty token: " + tokenLabel.Word());
        return;
    }
    string firstChar = token[0].ToString();
    // Start at 0 to make sure we include the whole token according to the tokenizer
    iobList.Add(CreateDatum(cl, firstChar, firstLabel, 0, origIndex + 1));
    int numChars = token.Length;
    if (crossRefRewrites && rewritten.Length != numChars)
    {
        // Lengths disagree, so per-character cross-referencing would be misaligned.
        System.Console.Error.WriteLine("Rewritten annotation doesn't have correct length: " + token + ">>>" + rewritten);
        crossRefRewrites = false;
    }

    ++origIndex;
    for (int j = 1; j < numChars; ++j, ++origIndex)
    {
        while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
        {
            ++origIndex;
        }
        // Clamp to the last character of origWord once the original text runs out.
        if (origIndex >= origWord.Length)
        {
            origIndex = origWord.Length - 1;
        }
        string charLabel = (j == numChars - 1) ? lastLabel : ContinuationSymbol;
        string thisChar = token[j].ToString();
        // A character that differs from the rewritten annotation is labeled as a rewrite.
        if (crossRefRewrites && !rewritten[j].ToString().Equals(thisChar))
        {
            charLabel = RewriteSymbol;
        }
        if (charLabel == ContinuationSymbol && thisChar.Equals("ى") && j != numChars - 1)
        {
            // Assume all mid-word alef maqsura are supposed to be yah
            charLabel = RewriteSymbol;
        }
        iobList.Add(CreateDatum(cl, thisChar, charLabel, origIndex, origIndex + 1));
    }

    // End at endPosition to make sure we include the whole token according to the tokenizer
    if (iobList.Count != 0)
    {
        iobList[iobList.Count - 1].SetEndPosition(cl.EndPosition());
    }
}