예제 #1
0
        /// <summary>Deterministically classify a token by where the segmentation marker appears in it.</summary>
        /// <param name="token">The token text to classify.</param>
        /// <param name="segMarker">The segmentation marker string; may be null.</param>
        /// <returns>
        /// <c>NoMarker</c> when <paramref name="segMarker"/> is null or the token is exactly the marker;
        /// otherwise <c>BothMarker</c>, <c>BeginMarker</c>, <c>EndMarker</c> or <c>NoMarker</c> depending on
        /// whether the token starts and/or ends with the marker.
        /// </returns>
        private static IOBUtils.TokenType GetTokenType(string token, string segMarker)
        {
            // A token that consists of the marker alone is treated as an ordinary token.
            if (segMarker == null || token.Equals(segMarker))
            {
                return(IOBUtils.TokenType.NoMarker);
            }

            // The marker is a literal symbol, so compare ordinally: the single-argument
            // StartsWith/EndsWith overloads are culture-sensitive and may treat some
            // characters as ignorable, which would misclassify tokens.
            bool startsWithMarker = token.StartsWith(segMarker, System.StringComparison.Ordinal);
            bool endsWithMarker   = token.EndsWith(segMarker, System.StringComparison.Ordinal);

            if (startsWithMarker && endsWithMarker)
            {
                return(IOBUtils.TokenType.BothMarker);
            }
            if (startsWithMarker)
            {
                return(IOBUtils.TokenType.BeginMarker);
            }
            if (endsWithMarker)
            {
                return(IOBUtils.TokenType.EndMarker);
            }
            return(IOBUtils.TokenType.NoMarker);
        }
예제 #2
0
        /// <summary>
        /// Remove the leading and/or trailing segmentation marker character from a token,
        /// as indicated by its token type.
        /// </summary>
        /// <param name="tok">The token text, possibly carrying marker characters.</param>
        /// <param name="tokType">
        /// The token's marker type: begin/both drop the first character, end/both drop the last one.
        /// </param>
        /// <returns>
        /// <paramref name="tok"/> itself when the type is <c>NoMarker</c>; otherwise the substring
        /// without the indicated marker character(s).
        /// </returns>
        private static string StripSegmentationMarkers(string tok, IOBUtils.TokenType tokType)
        {
            int start = 0;
            int end   = tok.Length;

            if (tokType == IOBUtils.TokenType.NoMarker)
            {
                return(tok);
            }
            if (tokType == IOBUtils.TokenType.BothMarker || tokType == IOBUtils.TokenType.BeginMarker)
            {
                // Skip the leading marker character.
                start = 1;
            }
            if (tokType == IOBUtils.TokenType.BothMarker || tokType == IOBUtils.TokenType.EndMarker)
            {
                // Stop before the trailing marker character.
                end = end - 1;
            }
            return(Sharpen.Runtime.Substring(tok, start, end));
        }
예제 #3
0
        /// <summary>
        /// Convert a list of tokens to a list of character-level datums suitable for labeling
        /// in an IOB segmentation model.
        /// </summary>
        /// <remarks>
        /// Tokens for which <c>ShouldNotSegment</c> is true are added as a single datum labeled
        /// <c>NosegSymbol</c>; all other tokens are expanded by <c>TokenToDatums</c>. A boundary
        /// datum is inserted before a token whenever the previous token ended a word, and
        /// <c>FillInWordStatistics</c> is called once per completed word.
        /// </remarks>
        /// <param name="tokenList">the tokens to convert</param>
        /// <param name="segMarker">the segmentation marker character attached to token text</param>
        /// <param name="applyRewriteRules">add rewrite labels (for training data)</param>
        /// <param name="stripRewrites">
        /// revert training data to old Green and DeNero model (remove
        /// rewrite labels but still rewrite to try to preserve raw text)
        /// </param>
        /// <param name="tf">a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)</param>
        /// <param name="origText">the original string before tokenization (for determining original segment boundaries)</param>
        /// <returns>the newly created list of datums</returns>
        public static IList <CoreLabel> StringToIOB(IList <CoreLabel> tokenList, char segMarker, bool applyRewriteRules, bool stripRewrites, ITokenizerFactory <CoreLabel> tf, string origText)
        {
            IList <CoreLabel> iobList      = new List <CoreLabel>(tokenList.Count * 7 + tokenList.Count);
            string            strSegMarker = segMarker.ToString();
            bool   addWhitespace           = false;
            string lastToken      = string.Empty;
            string currentWord    = string.Empty;
            int    wordStartIndex = 0;

            foreach (CoreLabel cl in tokenList)
            {
                if (addWhitespace)
                {
                    // The previous token closed a word: record its statistics, then emit a boundary datum.
                    FillInWordStatistics(iobList, currentWord, wordStartIndex);
                    currentWord    = string.Empty;
                    // The next word starts just past the boundary datum added below.
                    wordStartIndex = iobList.Count + 1;
                    iobList.Add(CreateDatum(cl, BoundaryChar, BoundarySymbol));
                    CoreLabel boundaryDatum = iobList[iobList.Count - 1];
                    boundaryDatum.SetIndex(0);
                    boundaryDatum.SetWord(string.Empty);
                    addWhitespace = false;
                }
                // What type of token is this
                string             token   = cl.Word();
                IOBUtils.TokenType tokType = GetTokenType(token, strSegMarker);
                token = StripSegmentationMarkers(token, tokType);
                System.Diagnostics.Debug.Assert(token.Length != 0);
                if (ShouldNotSegment(token))
                {
                    // Kept as one unit: a single datum for the whole token, always followed by a boundary.
                    iobList.Add(CreateDatum(cl, token, NosegSymbol));
                    addWhitespace = true;
                }
                else
                {
                    // Iterate over the characters in the token
                    TokenToDatums(iobList, cl, token, tokType, cl, lastToken, applyRewriteRules, stripRewrites, tf, origText);
                    // A token with no trailing marker ends the current word.
                    addWhitespace = (tokType == IOBUtils.TokenType.BeginMarker || tokType == IOBUtils.TokenType.NoMarker);
                }
                currentWord += token;
                lastToken    = token;
            }
            // Record statistics for the final word.
            FillInWordStatistics(iobList, currentWord, wordStartIndex);
            return(iobList);
        }
예제 #4
0
        /// <summary>Convert token to a sequence of per-character datums and add them to iobList.</summary>
        /// <remarks>
        /// Nothing is added when <paramref name="token"/> is empty, either on entry or after the
        /// rewrite rules have been applied (the latter case is logged).
        /// </remarks>
        /// <param name="iobList">the list the datums are appended to</param>
        /// <param name="cl">
        /// the label passed to <c>CreateDatum</c>; also supplies the rewritten-Arabic annotation and
        /// the begin/end positions into <paramref name="origText"/>
        /// </param>
        /// <param name="token">the token text to expand, one datum per character</param>
        /// <param name="tokType">the token's marker type; used to strip markers from the rewritten annotation</param>
        /// <param name="tokenLabel">the label supplying the raw word and tag consulted by the rewrite rules</param>
        /// <param name="lastToken">the text of the preceding token (consulted by rule #2)</param>
        /// <param name="applyRewriteRules">whether to apply the Arabic-specific rewrite rules</param>
        /// <param name="stripRewrites">
        /// suppresses the <c>RewriteSymbol</c> label in the rewrite rules that check it
        /// (the token text may still be rewritten)
        /// </param>
        /// <param name="tf">a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)</param>
        /// <param name="origText">the original string before tokenization (for determining original segment boundaries)</param>
        private static void TokenToDatums(IList <CoreLabel> iobList, CoreLabel cl, string token, IOBUtils.TokenType tokType, CoreLabel tokenLabel, string lastToken, bool applyRewriteRules, bool stripRewrites, ITokenizerFactory <CoreLabel> tf, string origText
                                          )
        {
            if (token.IsEmpty())
            {
                return;
            }
            string lastLabel        = ContinuationSymbol;
            string firstLabel       = BeginSymbol;
            string rewritten        = cl.Get(typeof(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation));
            bool   crossRefRewrites = true;

            if (rewritten == null)
            {
                // No rewritten annotation: nothing to cross-reference against.
                rewritten        = token;
                crossRefRewrites = false;
            }
            else
            {
                rewritten = StripSegmentationMarkers(rewritten, tokType);
            }
            if (applyRewriteRules)
            {
                // Apply Arabic-specific re-write rules.
                // All literal prefix/suffix checks below are ordinal: the single-argument
                // StartsWith/EndsWith overloads are culture-sensitive.
                string rawToken = tokenLabel.Word();
                string tag      = tokenLabel.Tag();
                MorphoFeatureSpecification featureSpec = new ArabicMorphoFeatureSpecification();
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Ngen);
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Nnum);
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Def);
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Tense);
                MorphoFeatures features = featureSpec.StrToFeatures(tag);
                // Rule #1 : ت --> ة
                if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Ngen).Equals("F") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Nnum).Equals("SG") && rawToken.EndsWith("ت-", System.StringComparison.Ordinal) && !stripRewrites)
                {
                    lastLabel = RewriteSymbol;
                }
                else
                {
                    if (rawToken.EndsWith("ة-", System.StringComparison.Ordinal))
                    {
                        System.Diagnostics.Debug.Assert(token.EndsWith("ة", System.StringComparison.Ordinal));
                        token     = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ت";
                        lastLabel = RewriteSymbol;
                    }
                }
                // Rule #2 : لل --> ل ال
                if (lastToken.Equals("ل") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Def).Equals("D"))
                {
                    if (rawToken.StartsWith("-ال", System.StringComparison.Ordinal))
                    {
                        if (!token.StartsWith("ا", System.StringComparison.Ordinal))
                        {
                            log.Info("Bad REWAL: " + rawToken + " / " + token);
                        }
                        // Drop the first character from both the token and its rewritten form.
                        token     = Sharpen.Runtime.Substring(token, 1);
                        rewritten = Sharpen.Runtime.Substring(rewritten, 1);
                        if (!stripRewrites)
                        {
                            firstLabel = RewriteSymbol;
                        }
                    }
                    else
                    {
                        if (rawToken.StartsWith("-ل", System.StringComparison.Ordinal))
                        {
                            if (!token.StartsWith("ل", System.StringComparison.Ordinal))
                            {
                                log.Info("Bad REWAL: " + rawToken + " / " + token);
                            }
                            if (!stripRewrites)
                            {
                                firstLabel = RewriteSymbol;
                            }
                        }
                        else
                        {
                            log.Info("Ignoring REWAL: " + rawToken + " / " + token);
                        }
                    }
                }
                // Rule #3 : ي --> ى
                // Rule #4 : ا --> ى
                if (rawToken.EndsWith("ى-", System.StringComparison.Ordinal))
                {
                    // NOTE(review): this branch only separates verbs from other tokens if GetValue
                    // returns null for an absent Tense feature — confirm against MorphoFeatures.
                    if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Tense) != null)
                    {
                        // verb: ى becomes ا
                        token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ا";
                    }
                    else
                    {
                        // assume preposition:
                        token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ي";
                    }
                    if (!stripRewrites)
                    {
                        lastLabel = RewriteSymbol;
                    }
                }
                else
                {
                    if (rawToken.Equals("علي-") || rawToken.Equals("-علي-"))
                    {
                        if (!stripRewrites)
                        {
                            lastLabel = RewriteSymbol;
                        }
                    }
                }
            }
            string origWord;

            if (origText == null)
            {
                origWord = tokenLabel.Word();
            }
            else
            {
                origWord = Sharpen.Runtime.Substring(origText, cl.BeginPosition(), cl.EndPosition());
            }
            int origIndex = 0;

            // Skip characters of the original word for which IsDeletedCharacter is true.
            while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
            {
                ++origIndex;
            }
            // Create datums and add to iobList
            if (token.IsEmpty())
            {
                // Rule #2 can strip the only character of the token. There is nothing left to
                // emit, so stop here instead of indexing into an empty string below.
                log.Info("Rewriting resulted in empty token: " + tokenLabel.Word());
                return;
            }
            string firstChar = token[0].ToString();

            // Start at 0 to make sure we include the whole token according to the tokenizer
            iobList.Add(CreateDatum(cl, firstChar, firstLabel, 0, origIndex + 1));
            int numChars = token.Length;

            if (crossRefRewrites && rewritten.Length != numChars)
            {
                System.Console.Error.Printf("Rewritten annotation doesn't have correct length: %s>>>%s%n", token, rewritten);
                crossRefRewrites = false;
            }
            ++origIndex;
            for (int j = 1; j < numChars; ++j, ++origIndex)
            {
                while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
                {
                    ++origIndex;
                }
                // Clamp to the last character when the original word is exhausted.
                if (origIndex >= origWord.Length)
                {
                    origIndex = origWord.Length - 1;
                }
                string charLabel = (j == numChars - 1) ? lastLabel : ContinuationSymbol;
                string thisChar  = token[j].ToString();
                // A character that differs from the rewritten annotation is labeled as a rewrite.
                if (crossRefRewrites && !rewritten[j].ToString().Equals(thisChar))
                {
                    charLabel = RewriteSymbol;
                }
                // Assume all mid-word alef maqsura are supposed to be yah
                if (charLabel == ContinuationSymbol && thisChar.Equals("ى") && j != numChars - 1)
                {
                    charLabel = RewriteSymbol;
                }
                iobList.Add(CreateDatum(cl, thisChar, charLabel, origIndex, origIndex + 1));
            }
            // End at endPosition to make sure we include the whole token according to the tokenizer
            if (!iobList.IsEmpty())
            {
                iobList[iobList.Count - 1].SetEndPosition(cl.EndPosition());
            }
        }