/// <summary>
/// Builds the entity that begins at the given position.
/// </summary>
/// <remarks>
/// The entity's first word is <c>wordDoc[position]</c>. Following positions are
/// absorbed for as long as their class label, split on "-", reads <c>I-&lt;tag&gt;</c>.
/// The entity is returned to the caller; this method does not store it in the
/// entities array itself.
/// </remarks>
/// <param name="sequence">Label sequence; each element is looked up through <c>classIndex</c>.</param>
/// <param name="position">Index of the first word of the entity.</param>
/// <param name="tag">Entity tag (the part of the label after "-").</param>
/// <returns>The newly built entity, with its other occurrences already computed.</returns>
public virtual EntityBIO ExtractEntity(int[] sequence, int position, string tag)
{
    EntityBIO result = new EntityBIO
    {
        type = tagIndex.IndexOf(tag),
        startPosition = position,
        words = new List<string> { wordDoc[position] }
    };

    int cursor = position + 1;
    while (cursor < sequence.Length)
    {
        string[] pieces = classIndex.Get(sequence[cursor]).Split("-");
        bool continuesEntity = pieces[0].Equals("I") && pieces[1].Equals(tag);
        if (!continuesEntity)
        {
            break;
        }
        result.words.Add(wordDoc[cursor]);
        cursor++;
    }

    result.otherOccurrences = OtherOccurrences(result);
    return result;
}
/// <summary>
/// Points every slot of the entities array covered by the entity
/// (from its start position, one slot per word) at that entity.
/// </summary>
/// <param name="entity">Entity to register.</param>
private void AddEntityToEntitiesArray(EntityBIO entity)
{
    int slot = entity.startPosition;
    int end = slot + entity.words.Count;
    while (slot < end)
    {
        entities[slot] = entity;
        slot++;
    }
}
/// <summary>
/// Finds the other locations in the document where the sequence of
/// words in this entity occurs.
/// </summary>
/// <remarks>
/// Every index of <c>wordDoc</c> except the entity's own start position is
/// tested with <see cref="Matches"/>; matching indices are collected in the
/// order they are visited.
/// </remarks>
/// <param name="entity">Entity whose word sequence is searched for.</param>
/// <returns>The collected start indices, converted by <c>ToArray</c>.</returns>
public virtual int[] OtherOccurrences(EntityBIO entity)
{
    IList<int> other = new List<int>();
    for (int i = 0; i < wordDoc.Count; i++)
    {
        if (i == entity.startPosition)
        {
            // The entity trivially matches itself; skip its own position.
            continue;
        }
        if (Matches(entity, i))
        {
            // Store the index directly: int.Parse only accepts textual input,
            // so it cannot be used to box or copy an int.
            other.Add(i);
        }
    }
    return ToArray(other);
}
/// <summary>
/// Tests whether the entity's words appear, ignoring case, in the document
/// starting at the given position.
/// </summary>
/// <param name="entity">Entity whose words are compared.</param>
/// <param name="position">Document index at which the comparison starts.</param>
/// <returns>
/// <c>true</c> when every word of the entity equals (case-insensitively) the
/// document word at the corresponding offset; <c>false</c> on the first
/// mismatch or when the document ends before the entity does.
/// </returns>
public virtual bool Matches(EntityBIO entity, int position)
{
    string firstWord = wordDoc[position];
    if (!Sharpen.Runtime.EqualsIgnoreCase(firstWord, entity.words[0]))
    {
        return false;
    }

    int offset = 1;
    while (offset < entity.words.Count)
    {
        int docIndex = position + offset;
        if (docIndex >= wordDoc.Count)
        {
            // Ran off the end of the document before the entity was exhausted.
            return false;
        }
        if (!Sharpen.Runtime.EqualsIgnoreCase(wordDoc[docIndex], entity.words[offset]))
        {
            return false;
        }
        offset++;
    }
    return true;
}
/// <summary>
/// Stores the initial label sequence and rebuilds the entities array from it.
/// </summary>
/// <remarks>
/// A fresh entities array of the same length is allocated (all slots null).
/// Each position whose label starts with "B" begins an entity, which is
/// extracted and registered; the scan then resumes after that entity.
/// </remarks>
/// <param name="initialSequence">Label sequence to initialise from.</param>
public virtual void SetInitialSequence(int[] initialSequence)
{
    this.sequence = initialSequence;
    int total = initialSequence.Length;
    entities = new EntityBIO[total];

    for (int index = 0; index < total; index++)
    {
        if (initialSequence[index] == backgroundSymbol)
        {
            continue;
        }

        string[] pieces = classIndex.Get(sequence[index]).Split("-");
        //TODO(mengqiu) this needs to be updated, so that initial can be I as well
        if (!pieces[0].Equals("B"))
        {
            continue;
        }

        // B-xxx: pull out the whole entity and jump to its last word
        // (the loop increment then moves past it).
        EntityBIO found = ExtractEntity(initialSequence, index, pieces[1]);
        AddEntityToEntitiesArray(found);
        index += found.words.Count - 1;
    }
}
/// <summary>
/// Brings the entities array up to date after the label at one position changed.
/// </summary>
/// <remarks>
/// The handling depends on the new label (O, B-xxx or I-xxx) and on the old one:
/// entities that started at the position are cleared, entities that ran through
/// it are shortened, and entities that now start at or extend into it are
/// (re-)extracted. Nothing happens when the label did not actually change.
/// </remarks>
/// <param name="sequence">The label sequence after the change; also stored on this instance.</param>
/// <param name="position">Index whose label changed.</param>
/// <param name="oldVal">Label value that was at <paramref name="position"/> before the change.</param>
/// <exception cref="InvalidOperationException">
/// The old label was B-xxx and the new one is O, but no entity is registered at the position.
/// </exception>
public virtual void UpdateSequenceElement(int[] sequence, int position, int oldVal)
{
    this.sequence = sequence;
    if (sequence[position] == oldVal)
    {
        return;
    }
    if (Verbose)
    {
        log.Info("changing position " + position + " from " + classIndex.Get(oldVal) + " to " + classIndex.Get(sequence[position]));
    }

    if (sequence[position] == backgroundSymbol)
    {
        // New tag is O.
        string oldRawTag = classIndex.Get(oldVal);
        string[] oldParts = oldRawTag.Split("-");
        if (oldParts[0].Equals("B"))
        {
            // Old tag was a B: the entity starting here disappears entirely.
            EntityBIO entity = entities[position];
            if (entity == null)
            {
                throw new InvalidOperationException("oldTag starts with B, entity at position should not be null");
            }
            ClearEntitySlots(position, entity.words.Count);
        }
        else if (entities[position] != null)
        {
            // Old tag was an I inside an entity: that entity is cut short here.
            if (Verbose)
            {
                log.Info("splitting off prev entity");
            }
            TruncateEntityBefore(position);
            if (Verbose && position > 0)
            {
                log.Info("position:" + position + ", entities[position-1] = " + entities[position - 1].ToString(tagIndex));
            }
        }
        // Otherwise: a non-entity I-xxx became O, no entity is affected.
        return;
    }

    string rawTag = classIndex.Get(sequence[position]);
    string[] parts = rawTag.Split("-");
    bool oldWasBackground = oldVal == backgroundSymbol;

    if (parts[0].Equals("B"))
    {
        // New tag is B: whatever was here before, a new entity starts at position.
        if (!oldWasBackground)
        {
            string[] previousParts = classIndex.Get(oldVal).Split("-");
            if (previousParts[0].Equals("B"))
            {
                // Was a different B-xxx. A multi-word old entity must be wiped first;
                // a single-word one is simply overwritten by the new registration.
                // NOTE(review): entities[position] is dereferenced without a null check here.
                EntityBIO replaced = entities[position];
                if (replaced.words.Count > 1)
                {
                    ClearEntitySlots(position, replaced.words.Count);
                }
            }
            else if (entities[position] != null)
            {
                // Was an I inside an entity: break the old entity at this position.
                TruncateEntityBefore(position);
            }
        }
        EntityBIO started = ExtractEntity(sequence, position, parts[1]);
        AddEntityToEntitiesArray(started);
        return;
    }

    // New tag is I.
    if (oldWasBackground)
    {
        // The previous entity, if any, may now extend into this position.
        ReextractEntityEndingBefore(sequence, position, false);
        return;
    }

    string[] priorParts = classIndex.Get(oldVal).Split("-");
    if (priorParts[0].Equals("B"))
    {
        // Was a B: clean the B entity first, then check whether the previous one extends.
        // NOTE(review): entities[position] is dereferenced without a null check here.
        ClearEntitySlots(position, entities[position].words.Count);
        ReextractEntityEndingBefore(sequence, position, true);
    }
    else if (entities[position] != null)
    {
        // Was a different I-xxx inside an entity: shorten it and drop the remainder.
        TruncateEntityBefore(position);
    }
    else
    {
        // Was a stray I-xxx: re-calculate the previous entity if one exists.
        ReextractEntityEndingBefore(sequence, position, false);
    }
}

/// <summary>
/// Sets <paramref name="count"/> consecutive slots of the entities array,
/// starting at <paramref name="start"/>, to null.
/// </summary>
private void ClearEntitySlots(int start, int count)
{
    for (int i = 0; i < count; i++)
    {
        entities[start + i] = null;
    }
}

/// <summary>
/// Shortens the entity registered at <paramref name="position"/> so that it keeps
/// only the words before that position, recomputes its other occurrences, and
/// clears the slots that the removed words used to occupy.
/// </summary>
private void TruncateEntityBefore(int position)
{
    EntityBIO oldEntity = entities[position];
    int oldLen = oldEntity.words.Count;
    int offset = position - oldEntity.startPosition;

    IList<string> newWords = new List<string>();
    for (int i = 0; i < offset; i++)
    {
        newWords.Add(oldEntity.words[i]);
    }
    oldEntity.words = newWords;
    oldEntity.otherOccurrences = OtherOccurrences(oldEntity);

    // Clean the slots from position to the old end of the entity.
    ClearEntitySlots(position, oldLen - offset);
}

/// <summary>
/// If an entity is registered at <c>position - 1</c>, extracts it again with the
/// same tag, starting <c>words.Count</c> slots before <paramref name="position"/>,
/// and registers the result. Does nothing when <paramref name="position"/> is 0
/// or the preceding slot is empty.
/// </summary>
/// <param name="sequence">Current label sequence.</param>
/// <param name="position">Index whose predecessor is inspected.</param>
/// <param name="logPrevious">When true (and Verbose is set), logs the preceding entity before re-extracting.</param>
private void ReextractEntityEndingBefore(int[] sequence, int position, bool logPrevious)
{
    if (position <= 0)
    {
        return;
    }
    EntityBIO previous = entities[position - 1];
    if (previous == null)
    {
        return;
    }

    string oldTag = tagIndex.Get(previous.type);
    if (logPrevious && Verbose)
    {
        log.Info("position:" + position + ", entities[position-1] = " + previous.ToString(tagIndex));
    }
    EntityBIO entity = ExtractEntity(sequence, position - 1 - previous.words.Count + 1, oldTag);
    AddEntityToEntitiesArray(entity);
}
/// <summary>
/// Computes a score from the current entities array by comparing each entity
/// with whatever is labelled at its other occurrences.
/// </summary>
/// <remarks>
/// The <paramref name="sequence"/> argument is not read; the score is derived
/// from the entities array alone. Each entity is visited once, at its first slot.
/// For every other occurrence of its words, the first entity found within the
/// occurrence window is compared with it: an "exact" match (the other entity
/// lists a position inside this entity among its own occurrences) uses
/// <c>entityMatrix</c> and may add a length-mismatch term weighted by <c>p1</c>;
/// otherwise <c>subEntityMatrix</c> is used. When both tags are equal the factor
/// is replaced by <c>p2</c> or 0 depending on <c>flags.matchNERIncentive</c>.
/// </remarks>
/// <param name="sequence">Unused by this implementation.</param>
/// <returns>The accumulated score.</returns>
public override double ScoreOf(int[] sequence)
{
    double total = 0.0;
    for (int start = 0; start < entities.Length; start++)
    {
        EntityBIO current = entities[start];
        bool beginsEntity = (start == 0 || entities[start - 1] != current) && current != null;
        if (!beginsEntity)
        {
            continue;
        }

        int span = current.words.Count;
        int tagA = current.type;
        // String tag1 = classIndex.get(entity.type);
        int lastCovered = start + span - 1;

        foreach (int occurrence in entities[start].otherOccurrences)
        {
            // Take the first entity registered inside the occurrence window, if any.
            EntityBIO overlapped = null;
            for (int probe = occurrence; probe < occurrence + span && probe < entities.Length; probe++)
            {
                overlapped = entities[probe];
                if (overlapped != null)
                {
                    break;
                }
            }
            // singleton + other instance null?
            if (overlapped == null)
            {
                continue;
            }

            int overlappedSpan = overlapped.words.Count;
            // String tag2 = classIndex.get(otherEntity.type);
            int tagB = overlapped.type;

            // exact match??
            bool exact = false;
            foreach (int back in overlapped.otherOccurrences)
            {
                if (back >= start && back <= lastCovered)
                {
                    exact = true;
                    break;
                }
            }

            double factor; // initialized in 2 cases below
            if (!exact)
            {
                if (Debug)
                {
                    log.Info("Sub match of tag1=" + tagIndex.Get(tagA) + ", tag2=" + tagIndex.Get(tagB));
                }
                factor = subEntityMatrix[tagA][tagB];
            }
            else
            {
                if (Debug)
                {
                    log.Info("Exact match of tag1=" + tagIndex.Get(tagA) + ", tag2=" + tagIndex.Get(tagB));
                }
                // entity not complete
                if (span != overlappedSpan)
                {
                    if (tagA == tagB)
                    {
                        total += Math.Abs(overlappedSpan - span) * p1;
                    }
                    else
                    {
                        bool orgLocPair = (tagA == ORGIndex && tagB == LOCIndex) || (tagA == LOCIndex && tagB == ORGIndex);
                        if (!orgLocPair)
                        {
                            // shorter
                            total += (overlappedSpan + span) * p1;
                        }
                    }
                }
                factor = entityMatrix[tagA][tagB];
            }

            if (tagA == tagB)
            {
                if (flags.matchNERIncentive)
                {
                    factor = p2;
                }
                else
                {
                    // factor *= -1;
                    factor = 0;
                }
            }

            if (Debug)
            {
                log.Info(" of factor=" + factor + ", p += " + (span * factor));
            }
            total += span * factor;
        }
    }
    return total;
}