/// <summary>
/// Runs the tagger over every token of <paramref name="span"/>, optionally updating the
/// model from each prediction, and tallies per-token outcomes against the gold tags.
/// </summary>
/// <param name="span">Sentence whose tokens are enumerated in order.</param>
/// <param name="spanTags">
/// Gold tag index for each non-special token, in enumeration order. Passed by <c>ref</c>
/// but never reassigned by this method. Must contain at least as many entries as the span
/// yields non-special tokens.
/// </param>
/// <param name="ScoreBuffer">Scratch buffer forwarded to <c>PredictTagFromFeatures</c>.</param>
/// <param name="features">Scratch buffer filled by <c>GetFeatures</c> for each token.</param>
/// <param name="updateModel">
/// When <c>true</c>, <c>UpdateModel</c> is called with the gold tag, the predicted tag and
/// the features for every token; when <c>false</c> the method only evaluates.
/// </param>
/// <returns>
/// <c>TP</c>: tokens whose gold tag is not Outside and whose prediction equals the gold tag.
/// <c>FN</c>: tokens whose gold tag is not Outside and whose prediction differs from it.
/// <c>FP</c>: tokens whose gold tag is Outside but whose prediction is something else.
/// </returns>
public (int TP, int FN, int FP) TrainOnSentence(ISpan span, ref int[] spanTags, Span<float> ScoreBuffer, Span<int> features, bool updateModel = true)
{
    //for training, we expect the tokens to have [BILOU]-[Type] entries as the only EntityType
    IToken prev = SpecialToken.BeginToken;
    IToken prev2 = SpecialToken.BeginToken;
    IToken curr = SpecialToken.BeginToken;
    IToken next = SpecialToken.BeginToken;
    IToken next2 = SpecialToken.BeginToken;

    int prevTag = IndexTagOutside;
    int prev2Tag = IndexTagOutside;
    int currTag = IndexTagOutside;

    int i = 0;
    int TP = 0, FN = 0, FP = 0;

    var en = span.GetEnumerator();

    // Five-token sliding window (prev2, prev, curr, next, next2). The enumerator feeds
    // next2, so curr lags two steps behind it; the loop stops once next has reached the
    // end marker, i.e. right after the last real token has been handled as curr.
    while (next != SpecialToken.EndToken)
    {
        prev2 = prev;
        prev = curr;
        curr = next;
        next = next2;

        // The tag history is built from the model's own predictions, not from the gold tags.
        prev2Tag = prevTag;
        prevTag = currTag;

        if (en.MoveNext())
        {
            next2 = en.Current;
        }
        else
        {
            next2 = SpecialToken.EndToken;
        }

        // Skip the begin/end padding while the window is still filling.
        if (!(curr is SpecialToken))
        {
            int tokenTag = spanTags[i];

            GetFeatures(features, curr, prev, prev2, next, next2, prevTag, prev2Tag);
            currTag = PredictTagFromFeatures(features, ScoreBuffer);

            if (updateModel)
            {
                UpdateModel(tokenTag, currTag, features);
            }

            if (tokenTag != IndexTagOutside)
            {
                // Gold says "entity": an exact match is a hit, anything else is a miss.
                if (currTag == tokenTag)
                {
                    TP++;
                }
                else
                {
                    FN++;
                }
            }
            else if (currTag != tokenTag)
            {
                // Gold says Outside but the model predicted an entity tag.
                FP++;
            }

            i++;
        }
    }

    return (TP, FN, FP);
}
/// <summary>
/// Scans <paramref name="span"/> for the pattern "word, hyphen, word" (where neither word
/// is a hyphen or punctuation) and tags the three tokens with <c>HyphenatedTag</c>.
/// </summary>
/// <remarks>
/// For each match the left word is tagged Begin, or, if it already carries a
/// <c>HyphenatedTag</c> entry, that entry is replaced with Inside. The hyphen is tagged
/// Inside and the right word End.
/// </remarks>
/// <param name="span">Sentence whose tokens are enumerated in order and tagged in place.</param>
/// <returns><c>true</c> if at least one hyphenated pattern was tagged.</returns>
private bool RecognizeEntities(ISpan span)
{
    // Sliding window of five tokens; the enumerator feeds the right-most slot.
    IToken farLeft = SpecialToken.BeginToken;
    IToken left = SpecialToken.BeginToken;
    IToken middle = SpecialToken.BeginToken;
    IToken right = SpecialToken.BeginToken;
    IToken incoming = SpecialToken.BeginToken;

    // Cached per-slot flags so each token is classified only once, when it enters the window.
    bool farLeftIsHyphen = false, leftIsHyphen = false, middleIsHyphen = false, rightIsHyphen = false, incomingIsHyphen = false;
    bool farLeftIsPunct = false, leftIsPunct = false, middleIsPunct = false, rightIsPunct = false, incomingIsPunct = false;

    bool taggedSomething = false;
    var tokens = span.GetEnumerator();

    while (incoming != SpecialToken.EndToken)
    {
        // Shift tokens one slot to the left.
        farLeft = left;
        left = middle;
        middle = right;
        right = incoming;

        // Shift the hyphen flags alongside the tokens.
        farLeftIsHyphen = leftIsHyphen;
        leftIsHyphen = middleIsHyphen;
        middleIsHyphen = rightIsHyphen;
        rightIsHyphen = incomingIsHyphen;

        // Shift the punctuation flags alongside the tokens.
        farLeftIsPunct = leftIsPunct;
        leftIsPunct = middleIsPunct;
        middleIsPunct = rightIsPunct;
        rightIsPunct = incomingIsPunct;

        if (tokens.MoveNext())
        {
            incoming = tokens.Current;
            incomingIsHyphen = incoming.ValueAsSpan.IsHyphen();
            incomingIsPunct = incoming.ValueAsSpan.IsAnyPunctuation();
        }
        else
        {
            // Source exhausted: the end marker terminates the loop after this pass.
            incoming = SpecialToken.EndToken;
            incomingIsHyphen = false;
        }

        // Nothing to test until a real token occupies the left slot.
        bool leftIsRealToken = left != SpecialToken.BeginToken;

        // pattern: word <hyphen> word, where word != punctuation
        bool hyphenBetweenWords = middleIsHyphen
                                  && !(leftIsHyphen || rightIsHyphen)
                                  && !(leftIsPunct || rightIsPunct);

        if (!leftIsRealToken || !hyphenBetweenWords)
        {
            continue;
        }

        int existingIndex = left.EntityTypes.FindIndex(0, et => et.Type == HyphenatedTag);
        if (existingIndex < 0)
        {
            left.AddEntityType(new EntityType(HyphenatedTag, EntityTag.Begin));
        }
        else
        {
            // The left word already carries the tag, so replace that entry with Inside.
            var continuation = new EntityType(HyphenatedTag, EntityTag.Inside);
            left.UpdateEntityType(existingIndex, ref continuation);
        }

        middle.AddEntityType(new EntityType(HyphenatedTag, EntityTag.Inside));
        right.AddEntityType(new EntityType(HyphenatedTag, EntityTag.End));
        taggedSomething = true;
    }

    return taggedSomething;
}
/// <summary>
/// Predicts a tag for every token of <paramref name="span"/> and attaches the resulting
/// entity types to the tokens, keeping only tag sequences that form a well-formed entity.
/// </summary>
/// <remarks>
/// A predicted tag is applied to its token only when it is:
/// <list type="bullet">
/// <item>Single, which is always accepted;</item>
/// <item>Begin, followed by zero or more Inside tags and then an End tag, all of the same
/// entity type (B+I+E or B+E);</item>
/// <item>Inside or End whose type matches the most recently accepted Begin that has not
/// yet been closed by an accepted End.</item>
/// </list>
/// </remarks>
/// <param name="span">Sentence whose tokens are enumerated in order and tagged in place.</param>
/// <param name="ScoreBuffer">Scratch buffer forwarded to <c>PredictTagFromFeatures</c>.</param>
/// <param name="features">Scratch buffer filled by <c>GetFeatures</c> for each token.</param>
/// <returns><c>true</c> if at least one entity type was added to a token.</returns>
public bool Predict(ISpan span, Span<float> ScoreBuffer, Span<int> features)
{
    IToken prev = SpecialToken.BeginToken;
    IToken prev2 = SpecialToken.BeginToken;
    IToken curr = SpecialToken.BeginToken;
    IToken next = SpecialToken.BeginToken;
    IToken next2 = SpecialToken.BeginToken;

    int prevTag = IndexTagOutside;
    int prev2Tag = IndexTagOutside;
    int currTag = IndexTagOutside;

    bool foundAny = false;
    int i = 0;

    var en = span.GetEnumerator();
    var tags = new int[span.TokensCount];

    // Pass 1: predict a raw tag per token with a five-token sliding window. The enumerator
    // feeds next2, so curr lags two steps behind; the loop ends once next is the end marker.
    while (next != SpecialToken.EndToken)
    {
        prev2 = prev;
        prev = curr;
        curr = next;
        next = next2;

        prev2Tag = prevTag;
        prevTag = currTag;

        if (en.MoveNext())
        {
            next2 = en.Current;
        }
        else
        {
            next2 = SpecialToken.EndToken;
        }

        if (curr != SpecialToken.BeginToken)
        {
            GetFeatures(features, curr, prev, prev2, next, next2, prevTag, prev2Tag);
            tags[i] = PredictTagFromFeatures(features, ScoreBuffer);
            currTag = tags[i];
            i++;
        }
    }

    // Pass 2: keep only tags that take part in a well-formed entity.
    string lastBegin = null;
    for (i = 0; i < span.TokensCount; i++)
    {
        if (tags[i] == IndexTagOutside)
        {
            continue;
        }

        var type = Data.IndexToEntityType[tags[i]];
        var tag = Data.IndexToEntityTag[tags[i]];

        bool valid = tag == EntityTag.Single; //Single is always valid

        if (tag == EntityTag.Begin)
        {
            //Checks if it's a valid combination of tags - i.e. B+I+E or B+E
            for (int j = i + 1; j < span.TokensCount; j++)
            {
                // An Outside token ends the candidate entity; stop before the table
                // lookups, exactly as the outer loop does for tags[i].
                if (tags[j] == IndexTagOutside)
                {
                    break;
                }

                // Look at the FOLLOWING token (index j), not the Begin token itself.
                var other_tag = Data.IndexToEntityTag[tags[j]];
                if (other_tag != EntityTag.Inside && other_tag != EntityTag.End)
                {
                    break;
                }

                var other_type = Data.IndexToEntityType[tags[j]];
                if (other_type != type)
                {
                    break;
                }

                if (other_tag == EntityTag.End)
                {
                    //found the right tag and right type by now
                    valid = true;
                    break;
                }
            }
        }
        else if (tag == EntityTag.Inside || tag == EntityTag.End)
        {
            // Only valid as the continuation of an accepted, still-open Begin of the same type.
            valid = type == lastBegin;
        }

        if (valid)
        {
            if (tag == EntityTag.Begin)
            {
                lastBegin = type;
            }

            if (tag == EntityTag.End)
            {
                lastBegin = null;
            }

            span[i].AddEntityType(new EntityType(type, tag));
            foundAny = true;
        }
    }

    return foundAny;
}