Ejemplo n.º 1
0
        // static methods
        /// <summary>
        /// Re-encodes the entity labels of a token sequence from any IOB-style
        /// (i.e., "I-PERS" style labels) or bare-category representation into
        /// the requested output representation.
        /// </summary>
        /// <remarks>
        /// It can read and change any representation to other representations:
        /// a 4 way representation of all entities, like S-PERS, B-PERS,
        /// I-PERS, E-PERS for single word, beginning, internal, and end of entity
        /// (IOBES or SBIEO); always marking the first word of an entity (IOB2 or BIO);
        /// only marking specially the beginning of non-first
        /// items of an entity sequences with B-PERS (IOB1);
        /// the reverse IOE1 and IOE2; IO where everything is I-tagged; and
        /// NOPREFIX, where no prefixes are written on category labels.
        /// The last two representations are deficient in not allowing adjacent
        /// entities of the same class to be represented, but nevertheless
        /// convenient.  Note that the background label is never given a prefix.
        /// This code is very specific to the particular CoNLL way of labeling
        /// classes for IOB-style encoding, but this notation is quite widespread.
        /// It will work on any of these styles of input.
        /// BILOU is also supported as an output style (B=B, I=I, L=E, O=O, U=S).
        /// It also works with lowercased names like i-org, because the prefix
        /// character is upper-cased before it is inspected.
        /// If the labels are not of the form "C-Y+", where C is a single character,
        /// then they will be regarded as NOPREFIX labels.
        /// A token whose label under <paramref name="key"/> is null is treated as
        /// carrying the background label.
        /// This method updates the tokens in place. Note that labels are read
        /// from <paramref name="key"/> but the new labels are always written to
        /// CoreAnnotations.AnswerAnnotation.
        /// </remarks>
        /// <param name="tokens">List of tokens (each a CoreLabel) in some style</param>
        /// <param name="key">The key in the CoreLabel to read labels from, commonly typeof(CoreAnnotations.AnswerAnnotation)</param>
        /// <param name="backgroundLabel">The background label, which gets special treatment (it never receives a prefix)</param>
        /// <param name="style">Output style (case-insensitive); one of iob1, iob2/bio, ioe1, ioe2, io, sbieo/iobes, noprefix, bilou</param>
        /// <param name="intern">Whether to String-intern the new labels (may as well, small number!)</param>
        /// <exception cref="ArgumentNullException">If <paramref name="tokens"/> or <paramref name="style"/> is null.</exception>
        /// <exception cref="ArgumentException">If <paramref name="style"/> is not one of the known style names.</exception>
        public static void EntitySubclassify<TOK>(IList<TOK> tokens, Type key, string backgroundLabel, string style, bool intern)
            where TOK : ICoreMap
        {
            if (tokens == null)
            {
                throw new ArgumentNullException(nameof(tokens));
            }
            if (style == null)
            {
                throw new ArgumentNullException(nameof(style));
            }

            // Named codes for the supported output encodings.
            const int Iob1     = 0;
            const int Iob2     = 1;
            const int Ioe1     = 2;
            const int Ioe2     = 3;
            const int Io       = 4;
            const int Iobes    = 5;
            const int NoPrefix = 6;
            const int Bilou    = 7;

            int how;
            // Style names are identifiers, not linguistic text, so lower-case them culture-invariantly.
            switch (style.ToLowerInvariant())
            {
            case "iob1":
                how = Iob1;
                break;

            case "iob2":
            case "bio":
                how = Iob2;
                break;

            case "ioe1":
                how = Ioe1;
                break;

            case "ioe2":
                how = Ioe2;
                break;

            case "io":
                how = Io;
                break;

            case "sbieo":
            case "iobes":
                how = Iobes;
                break;

            case "noprefix":
                how = NoPrefix;
                break;

            case "bilou":
                how = Bilou;
                break;

            default:
                throw new ArgumentException("entitySubclassify: unknown style: " + style);
            }

            // The padded view is indexed at i - 1 and i + 1 below, including -1 and Count;
            // presumably PaddedList returns the pad element there (verify against PaddedList).
            // The cast goes through object because C# has no direct conversion from a
            // concrete class to a type parameter.
            IList<TOK> paddedTokens = new PaddedList<TOK>(tokens, (TOK)(object)new CoreLabel());
            int size = paddedTokens.Count;

            // All new labels are computed first and only written back afterwards, so that
            // each token's neighbours are still seen with their original labels.
            string[] newAnswers = new string[size];
            for (int i = 0; i < size; i++)
            {
                TOK c = paddedTokens[i];
                TOK p = paddedTokens[i - 1];
                TOK n = paddedTokens[i + 1];

                // A missing label counts as background, for the current token as well as its neighbours.
                string cAns = c.Get(key);
                if (cAns == null)
                {
                    cAns = backgroundLabel;
                }
                string pAns = p.Get(key);
                if (pAns == null)
                {
                    pAns = backgroundLabel;
                }
                string nAns = n.Get(key);
                if (nAns == null)
                {
                    nAns = backgroundLabel;
                }

                string @base;
                char   prefix;
                SplitEntityLabel(cAns, out @base, out prefix);
                string pBase;
                char   pPrefix;
                SplitEntityLabel(pAns, out pBase, out pPrefix);
                string nBase;
                char   nPrefix;
                SplitEntityLabel(nAns, out nBase, out nPrefix);

                bool isStartAdjacentSame = IsSameEntityBoundary(pBase, pPrefix, @base, prefix);
                bool isEndAdjacentSame   = IsSameEntityBoundary(@base, prefix, nBase, nPrefix);
                bool isFirst             = IsDifferentEntityBoundary(pBase, @base) || isStartAdjacentSame;
                bool isLast              = IsDifferentEntityBoundary(@base, nBase) || isEndAdjacentSame;

                string newAnswer = @base;
                // The background label is never given a prefix.
                if (!@base.Equals(backgroundLabel))
                {
                    switch (how)
                    {
                    case Iob1:
                        // B only where this entity directly follows another entity of the same class.
                        newAnswer = (isStartAdjacentSame ? "B-" : "I-") + @base;
                        break;

                    case Iob2:
                        // Always B on the first token of an entity.
                        newAnswer = (isFirst ? "B-" : "I-") + @base;
                        break;

                    case Ioe1:
                        // E only where this entity is directly followed by another entity of the same class.
                        newAnswer = (isEndAdjacentSame ? "E-" : "I-") + @base;
                        break;

                    case Ioe2:
                        // Always E on the last token of an entity.
                        newAnswer = (isLast ? "E-" : "I-") + @base;
                        break;

                    case Io:
                        newAnswer = "I-" + @base;
                        break;

                    case Iobes:
                    case Bilou:
                    {
                        // Same four-way scheme; the two styles differ only in the letters
                        // used for single-token (S vs U) and entity-final (E vs L) tokens.
                        string marker;
                        if (isFirst && isLast)
                        {
                            marker = how == Iobes ? "S-" : "U-";
                        }
                        else if (isLast)
                        {
                            marker = how == Iobes ? "E-" : "L-";
                        }
                        else if (isFirst)
                        {
                            marker = "B-";
                        }
                        else
                        {
                            marker = "I-";
                        }
                        newAnswer = marker + @base;
                        break;
                    }

                    default:
                        // NoPrefix: nothing to do, the label is just the bare category.
                        break;
                    }
                }
                if (intern)
                {
                    newAnswer = string.Intern(newAnswer);
                }
                newAnswers[i] = newAnswer;
            }

            for (int j = 0; j < newAnswers.Length; j++)
            {
                TOK c = tokens[j];
                // NOTE(review): labels are read via 'key' but always written to AnswerAnnotation.
                // This is preserved as-is because callers may rely on it; confirm whether
                // 'key' was intended here.
                c.Set(typeof(CoreAnnotations.AnswerAnnotation), newAnswers[j]);
            }
        }

        /// <summary>
        /// Splits a label of the form "C-Y+" into its upper-cased one-character prefix and
        /// its base category; any other label is returned whole with a blank (' ') prefix.
        /// </summary>
        /// <param name="label">The non-null label to split</param>
        /// <param name="baseLabel">Receives the category without the "C-" prefix, or the whole label</param>
        /// <param name="prefix">Receives the upper-cased prefix character, or ' ' if there is none</param>
        private static void SplitEntityLabel(string label, out string baseLabel, out char prefix)
        {
            if (label.Length > 2 && label[1] == '-')
            {
                baseLabel = label.Substring(2);
                prefix    = char.ToUpperInvariant(label[0]);
            }
            else
            {
                baseLabel = label;
                prefix    = ' ';
            }
        }