예제 #1
0
        /// <summary>
        /// Runs the tagger over one sentence, optionally updating the model after every token, and
        /// tallies how the predicted tags compare with the expected ones.
        /// </summary>
        /// <param name="span">Sentence whose tokens are enumerated in order.</param>
        /// <param name="spanTags">Expected tag index for each non-special token, read by token position.</param>
        /// <param name="ScoreBuffer">Scratch buffer passed to <c>PredictTagFromFeatures</c>.</param>
        /// <param name="features">Scratch buffer filled by <c>GetFeatures</c> for each token.</param>
        /// <param name="updateModel">When true, <c>UpdateModel</c> is called with the expected and predicted tag of every token.</param>
        /// <returns>
        /// TP: expected tag is not Outside and the prediction matches it.
        /// FN: expected tag is not Outside and the prediction differs.
        /// FP: expected tag is Outside and the prediction differs.
        /// </returns>
        public (int TP, int FN, int FP) TrainOnSentence(ISpan span, ref int[] spanTags, Span <float> ScoreBuffer, Span <int> features, bool updateModel = true)
        {
            //for training, we expect the tokens to have [BILOU]-[Type] entries as the only EntityType
            IToken prev = SpecialToken.BeginToken; IToken prev2 = SpecialToken.BeginToken; IToken curr = SpecialToken.BeginToken; IToken next = SpecialToken.BeginToken; IToken next2 = SpecialToken.BeginToken;
            int    prevTag = IndexTagOutside; int prev2Tag = IndexTagOutside; int currTag = IndexTagOutside;

            int i = 0;

            int TP = 0, FN = 0, FP = 0;

            var en = span.GetEnumerator();

            while (next != SpecialToken.EndToken)
            {
                // Slide the five-token window (and the two previous predicted tags) one position forward.
                prev2 = prev; prev = curr; curr = next; next = next2; prev2Tag = prevTag; prevTag = currTag;
                if (en.MoveNext())
                {
                    next2 = en.Current;
                }
                else
                {
                    next2 = SpecialToken.EndToken;
                }

                // The first iterations only fill the look-ahead slots; curr is still a special token then.
                if (!(curr is SpecialToken))
                {
                    int tokenTag = spanTags[i];

                    GetFeatures(features, curr, prev, prev2, next, next2, prevTag, prev2Tag);

                    // The predicted tag (not the expected one) is what feeds the next tokens as history.
                    currTag = PredictTagFromFeatures(features, ScoreBuffer);

                    if (updateModel)
                    {
                        UpdateModel(tokenTag, currTag, features);
                    }

                    if (tokenTag != IndexTagOutside)
                    {
                        if (currTag == tokenTag)
                        {
                            TP++;
                        }
                        else
                        {
                            FN++;
                        }
                    }
                    else if (currTag != tokenTag)
                    {
                        FP++;
                    }

                    i++;
                }
            }

            return(TP, FN, FP);
        }
        /// <summary>
        /// Scans <paramref name="span"/> for a hyphen token sitting between two tokens that are neither
        /// hyphens nor punctuation, and marks the three tokens with <c>HyphenatedTag</c> entity types.
        /// </summary>
        /// <param name="span">Tokens to scan, enumerated in order.</param>
        /// <returns>True when at least one such pattern was tagged.</returns>
        private bool RecognizeEntities(ISpan span)
        {
            // Sliding window: left / middle / right are the three tokens under test,
            // lookahead is the token most recently pulled from the enumerator.
            IToken left      = SpecialToken.BeginToken;
            IToken middle    = SpecialToken.BeginToken;
            IToken right     = SpecialToken.BeginToken;
            IToken lookahead = SpecialToken.BeginToken;

            bool leftIsHyphen = false, middleIsHyphen = false, rightIsHyphen = false, lookaheadIsHyphen = false;
            bool leftIsPunct  = false, middleIsPunct  = false, rightIsPunct  = false, lookaheadIsPunct  = false;

            bool taggedSomething = false;

            var tokens = span.GetEnumerator();

            while (lookahead != SpecialToken.EndToken)
            {
                // Shift tokens and their cached hyphen / punctuation flags one slot to the left.
                left         = middle;
                middle       = right;
                right        = lookahead;
                leftIsHyphen = middleIsHyphen;
                middleIsHyphen = rightIsHyphen;
                rightIsHyphen  = lookaheadIsHyphen;
                leftIsPunct  = middleIsPunct;
                middleIsPunct  = rightIsPunct;
                rightIsPunct   = lookaheadIsPunct;

                if (tokens.MoveNext())
                {
                    lookahead         = tokens.Current;
                    lookaheadIsHyphen = lookahead.ValueAsSpan.IsHyphen();
                    lookaheadIsPunct  = lookahead.ValueAsSpan.IsAnyPunctuation();
                }
                else
                {
                    lookahead         = SpecialToken.EndToken;
                    lookaheadIsHyphen = false;
                }

                if (left != SpecialToken.BeginToken)
                {
                    // pattern: word <hyphen> word, where word != punctuation
                    bool hyphenBetweenWords = middleIsHyphen
                                              && !(leftIsHyphen || rightIsHyphen)
                                              && !(leftIsPunct || rightIsPunct);

                    if (hyphenBetweenWords)
                    {
                        int existing = left.EntityTypes.FindIndex(0, entity => entity.Type == HyphenatedTag);
                        if (existing < 0)
                        {
                            left.AddEntityType(new EntityType(HyphenatedTag, EntityTag.Begin));
                        }
                        else
                        {
                            // The left token already carries the tag (e.g. it closed a previous match),
                            // so it becomes an inner part of a longer chain.
                            var continued = new EntityType(HyphenatedTag, EntityTag.Inside);
                            left.UpdateEntityType(existing, ref continued);
                        }

                        middle.AddEntityType(new EntityType(HyphenatedTag, EntityTag.Inside));
                        right.AddEntityType(new EntityType(HyphenatedTag, EntityTag.End));
                        taggedSomething = true;
                    }
                }
            }

            return(taggedSomething);
        }
예제 #3
0
        /// <summary>
        /// Predicts a tag for every token of <paramref name="span"/> and attaches the corresponding
        /// entity type to each token whose tag is part of a well-formed sequence.
        /// </summary>
        /// <param name="span">Tokens to tag; entity types are added to them in place.</param>
        /// <param name="ScoreBuffer">Scratch buffer passed to <c>PredictTagFromFeatures</c>.</param>
        /// <param name="features">Scratch buffer filled by <c>GetFeatures</c> for each token.</param>
        /// <returns>True when at least one entity type was added to a token.</returns>
        public bool Predict(ISpan span, Span <float> ScoreBuffer, Span <int> features)
        {
            IToken prev = SpecialToken.BeginToken; IToken prev2 = SpecialToken.BeginToken; IToken curr = SpecialToken.BeginToken; IToken next = SpecialToken.BeginToken; IToken next2 = SpecialToken.BeginToken;
            int    prevTag = IndexTagOutside; int prev2Tag = IndexTagOutside; int currTag = IndexTagOutside;
            bool   foundAny = false;
            int    i        = 0;

            var en = span.GetEnumerator();

            int tokenCount = span.TokensCount;
            var tags       = new int[tokenCount];

            // Pass 1: predict one tag index per token, feeding the two previous predictions back in.
            while (next != SpecialToken.EndToken)
            {
                prev2 = prev; prev = curr; curr = next; next = next2; prev2Tag = prevTag; prevTag = currTag;
                if (en.MoveNext())
                {
                    next2 = en.Current;
                }
                else
                {
                    next2 = SpecialToken.EndToken;
                }

                // The first iterations only fill the look-ahead slots; curr is still the begin marker then.
                if (curr != SpecialToken.BeginToken)
                {
                    GetFeatures(features, curr, prev, prev2, next, next2, prevTag, prev2Tag);
                    tags[i] = PredictTagFromFeatures(features, ScoreBuffer);
                    currTag = tags[i];
                    i++;
                }
            }

            // Pass 2: keep only tags that form a valid sequence and attach them to the tokens.
            string lastBegin = null; // type of the currently open Begin..End sequence, if any

            for (i = 0; i < tokenCount; i++)
            {
                if (tags[i] != IndexTagOutside)
                {
                    var type = Data.IndexToEntityType[tags[i]];
                    var tag  = Data.IndexToEntityTag[tags[i]];

                    bool valid = tag == EntityTag.Single; //Single is always valid

                    if (tag == EntityTag.Begin)           //Checks if it's a valid combination of tags - i.e. B+I+E or B+E
                    {
                        for (int j = i + 1; j < tokenCount; j++)
                        {
                            // An Outside tag ends the sequence; checked before the lookups,
                            // mirroring the guard applied to tags[i] above.
                            if (tags[j] == IndexTagOutside)
                            {
                                break;
                            }

                            var other_tag = Data.IndexToEntityTag[tags[j]];

                            // Only Inside or End may follow a Begin.
                            if (other_tag != EntityTag.Inside && other_tag != EntityTag.End)
                            {
                                break;
                            }

                            var other_type = Data.IndexToEntityType[tags[j]];

                            if (other_type != type)
                            {
                                break;
                            }

                            if (other_tag == EntityTag.End)
                            {
                                valid = true; break;
                            }                                                        //found the right tag and right type by now
                        }
                    }
                    else if (tag == EntityTag.Inside || tag == EntityTag.End)
                    {
                        // Inside / End are only accepted while a validated Begin of the same type is open.
                        valid = type == lastBegin;
                    }

                    if (valid)
                    {
                        if (tag == EntityTag.Begin)
                        {
                            lastBegin = type;
                        }
                        if (tag == EntityTag.End)
                        {
                            lastBegin = null;
                        }

                        span[i].AddEntityType(new EntityType(type, tag));
                        foundAny = true;
                    }
                }
            }
            return(foundAny);
        }