/// <summary>
/// Displays this parse using Penn Treebank-style formatting.
/// </summary>
public virtual string Show()
{
    StringBuilder buffer = new StringBuilder();
    int start = mSpan.Start;
    if (mType != MaximumEntropyParser.TokenNode)
    {
        buffer.Append("(");
        buffer.Append(mType + " ");
    }
    foreach (Parse childParse in mParts)
    {
        Util.Span childSpan = childParse.mSpan;
        if (start < childSpan.Start)
        {
            //System.Console.Out.WriteLine("pre " + start + " " + childSpan.Start);
            buffer.Append(mText.Substring(start, childSpan.Start - start));
        }
        buffer.Append(childParse.Show());
        start = childSpan.End;
    }
    buffer.Append(mText.Substring(start, mSpan.End - start));
    if (mType != MaximumEntropyParser.TokenNode)
    {
        buffer.Append(")");
    }
    return buffer.ToString();
}
/// <summary>
/// Tokenizes the string.
/// </summary>
/// <param name="input">
/// The string to be tokenized.
/// </param>
/// <returns>
/// A span array containing individual tokens as elements.
/// </returns>
public virtual Util.Span[] TokenizePositions(string input)
{
    if (mUnicodeMapping)
    {
        input = Utils.MapUnicodeChars(input);
    }
    Util.Span[] tokens = Split(input);
    mNewTokens.Clear();
    mTokenProbabilities.Clear();
    for (int currentToken = 0, tokenCount = tokens.Length; currentToken < tokenCount; currentToken++)
    {
        Util.Span tokenSpan = tokens[currentToken];
        string token = input.Substring(tokenSpan.Start, tokenSpan.End - tokenSpan.Start);
        // Can't tokenize single characters
        if (token.Length < 2)
        {
            mNewTokens.Add(tokenSpan);
            mTokenProbabilities.Add(1.0);
        }
        else if (AlphaNumericOptimization && AlphaNumeric.IsMatch(token))
        {
            mNewTokens.Add(tokenSpan);
            mTokenProbabilities.Add(1.0);
        }
        else
        {
            int startPosition = tokenSpan.Start;
            int endPosition = tokenSpan.End;
            int originalStart = tokenSpan.Start;
            double tokenProbability = 1.0;
            for (int currentPosition = originalStart + 1; currentPosition < endPosition; currentPosition++)
            {
                //Console.Write("{0} {1}|{2} ({3})", currentPosition - originalStart - 1, token[currentPosition - originalStart - 1], token[currentPosition - originalStart], token);
                if (mAlphaNumericOptimization)
                {
                    char leftChar = token[currentPosition - originalStart - 1];
                    char rightChar = token[currentPosition - originalStart];
                    if (char.IsLetterOrDigit(leftChar) && char.IsLetterOrDigit(rightChar))
                    {
                        //Console.WriteLine();
                        continue;
                    }
                }
                double[] probabilities = mModel.Evaluate(mContextGenerator.GetContext(new Util.Pair<string, int>(token, currentPosition - originalStart)));
                string bestOutcome = mModel.GetBestOutcome(probabilities);
                //Console.WriteLine(bestOutcome);
                tokenProbability *= probabilities[mModel.GetOutcomeIndex(bestOutcome)];
                if (bestOutcome == TokenContextGenerator.SplitIndicator)
                {
                    mNewTokens.Add(new Util.Span(startPosition, currentPosition));
                    mTokenProbabilities.Add(tokenProbability);
                    startPosition = currentPosition;
                    tokenProbability = 1.0;
                }
            }
            mNewTokens.Add(new Util.Span(startPosition, endPosition));
            mTokenProbabilities.Add(tokenProbability);
        }
    }
    return mNewTokens.ToArray();
}
private void Show(Parse p, StringBuilder buffer)
{
    int start = p.Span.Start;
    if (p.Type != MaximumEntropyParser.TokenNode)
    {
        buffer.Append("(");
        buffer.Append(p.Type);
        if (mParseMap.ContainsKey(p))
        {
            buffer.Append("#" + mParseMap[p].ToString());
        }
        buffer.Append(" ");
    }
    Parse[] children = p.GetChildren();
    foreach (Parse c in children)
    {
        Util.Span s = c.Span;
        if (start < s.Start)
        {
            buffer.Append(p.Text.Substring(start, s.Start - start));
        }
        Show(c, buffer);
        start = s.End;
    }
    buffer.Append(p.Text.Substring(start, p.Span.End - start));
    if (p.Type != MaximumEntropyParser.TokenNode)
    {
        buffer.Append(")");
    }
}
public Mention(Util.Span span, Util.Span headSpan, int entityId, IParse parse, string extentType, string nameType)
{
    mSpan = span;
    mHeadSpan = headSpan;
    mId = entityId;
    mType = extentType;
    mParse = parse;
    mNameType = nameType;
}
public Parse(string parseText, Util.Span span, string type, double probability)
{
    mText = parseText;
    mSpan = span;
    mType = type;
    mProbability = probability;
    mHead = this;
    mParts = new List<Parse>();
    mLabel = null;
    mParent = null;
}
// constructors -----------------------

public Parse(string parseText, Util.Span span, string type, double probability)
{
    Text = parseText;
    Span = span;
    Type = type;
    Probability = probability;
    Head = this;
    _parts = new List<Parse>();
    Label = null;
    Parent = null;
}
private static void ClearMentions(Util.Set<IParse> mentions, IParse nounPhrase)
{
    Util.Span nounPhraseSpan = nounPhrase.Span;
    // Loop backwards through the set so that we can remove from the end forwards
    for (int currentMention = mentions.Count - 1; currentMention > -1; currentMention--)
    {
        if (mentions[currentMention].Span.Contains(nounPhraseSpan))
        {
            mentions.Remove(mentions[currentMention]);
        }
    }
}
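// Hedged usage sketch (not part of the library source): ClearMentions and the Substring
// calls elsewhere in these snippets treat Util.Span as start-inclusive and end-exclusive,
// with Contains(span) testing full enclosure. The snippet below only uses the members
// visible above (the constructor, Start, End, Contains) and assumes the same using
// directives as the surrounding files (System plus the library's Util namespace).
public static class SpanSketch
{
    public static void Demo()
    {
        string text = "President Lincoln spoke.";
        var sentence = new Util.Span(0, text.Length);
        var name = new Util.Span(10, 17); // "Lincoln"
        // End-exclusive arithmetic: length of the covered text is End - Start.
        string covered = text.Substring(name.Start, name.End - name.Start);
        bool enclosed = sentence.Contains(name); // expected: true
        Console.WriteLine("{0} enclosed={1}", covered, enclosed);
    }
}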
private void CollectCoordinatedNounPhraseMentions(IParse nounPhrase, List<Mention> entities)
{
    //System.err.println("collectCoordNp: "+np);
    List<IParse> nounPhraseTokens = nounPhrase.Tokens;
    bool inCoordinatedNounPhrase = false;
    int lastNounPhraseTokenIndex = mHeadFinder.GetHeadIndex(nounPhrase);
    for (int tokenIndex = lastNounPhraseTokenIndex - 1; tokenIndex >= 0; tokenIndex--)
    {
        IParse token = nounPhraseTokens[tokenIndex];
        string tokenText = token.ToString();
        if (tokenText == "and" || tokenText == "or")
        {
            if (lastNounPhraseTokenIndex != tokenIndex)
            {
                if (tokenIndex - 1 >= 0 && nounPhraseTokens[tokenIndex - 1].SyntacticType.StartsWith("NN"))
                {
                    Util.Span nounPhraseSpan = new Util.Span(nounPhraseTokens[tokenIndex + 1].Span.Start, nounPhraseTokens[lastNounPhraseTokenIndex].Span.End);
                    Mention nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
                    entities.Add(nounPhraseSpanExtent);
                    //System.err.println("adding extent for conjunction in: "+np+" preceded by "+((Parse) npTokens.get(ti-1)).getSyntacticType());
                    inCoordinatedNounPhrase = true;
                }
                else
                {
                    break;
                }
            }
            lastNounPhraseTokenIndex = tokenIndex - 1;
        }
        else if (inCoordinatedNounPhrase && tokenText.Equals(","))
        {
            if (lastNounPhraseTokenIndex != tokenIndex)
            {
                Util.Span nounPhraseSpan = new Util.Span(nounPhraseTokens[tokenIndex + 1].Span.Start, nounPhraseTokens[lastNounPhraseTokenIndex].Span.End);
                Mention nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
                entities.Add(nounPhraseSpanExtent);
                //System.err.println("adding extent for comma in: "+np);
            }
            lastNounPhraseTokenIndex = tokenIndex - 1;
        }
        else if (inCoordinatedNounPhrase && tokenIndex == 0 && lastNounPhraseTokenIndex >= 0)
        {
            Util.Span nounPhraseSpan = new Util.Span(nounPhraseTokens[tokenIndex].Span.Start, nounPhraseTokens[lastNounPhraseTokenIndex].Span.End);
            Mention nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
            entities.Add(nounPhraseSpanExtent);
            //System.err.println("adding extent for start coord in: "+np);
        }
    }
}
/// <summary>
/// Tokenizes the string.
/// </summary>
/// <param name="input">
/// The string to be tokenized.
/// </param>
/// <returns>
/// A span array containing individual tokens as elements.
/// </returns>
public virtual Util.Span[] TokenizePositions(string input)
{
    Util.Span[] tokens = Split(input);
    mNewTokens.Clear();
    mTokenProbabilities.Clear();
    for (int currentToken = 0, tokenCount = tokens.Length; currentToken < tokenCount; currentToken++)
    {
        Util.Span tokenSpan = tokens[currentToken];
        string token = input.Substring(tokenSpan.Start, tokenSpan.End - tokenSpan.Start);
        // Can't tokenize single characters
        if (token.Length < 2)
        {
            mNewTokens.Add(tokenSpan);
            mTokenProbabilities.Add(1.0);
        }
        else if (AlphaNumericOptimization && AlphaNumeric.IsMatch(token))
        {
            mNewTokens.Add(tokenSpan);
            mTokenProbabilities.Add(1.0);
        }
        else
        {
            int startPosition = tokenSpan.Start;
            int endPosition = tokenSpan.End;
            int originalStart = tokenSpan.Start;
            double tokenProbability = 1.0;
            for (int currentPosition = originalStart + 1; currentPosition < endPosition; currentPosition++)
            {
                double[] probabilities = mModel.Evaluate(mContextGenerator.GetContext(new Util.Pair<string, int>(token, currentPosition - originalStart)));
                string bestOutcome = mModel.GetBestOutcome(probabilities);
                tokenProbability *= probabilities[mModel.GetOutcomeIndex(bestOutcome)];
                if (bestOutcome == TokenContextGenerator.SplitIndicator)
                {
                    mNewTokens.Add(new Util.Span(startPosition, currentPosition));
                    mTokenProbabilities.Add(tokenProbability);
                    startPosition = currentPosition;
                    tokenProbability = 1.0;
                }
            }
            mNewTokens.Add(new Util.Span(startPosition, endPosition));
            mTokenProbabilities.Add(tokenProbability);
        }
    }
    return mNewTokens.ToArray();
}
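// Hedged usage sketch (not part of the library source): how the spans returned by
// TokenizePositions map back onto the input string. EnglishMaximumEntropyTokenizer and
// the model path are assumptions here; any concrete tokenizer exposing TokenizePositions
// would be used the same way.
public static class TokenizePositionsSketch
{
    public static void Demo()
    {
        var tokenizer = new EnglishMaximumEntropyTokenizer(@"Models\EnglishTok.nbin"); // assumed class and model path
        string input = "Mr. Smith didn't leave.";
        Util.Span[] spans = tokenizer.TokenizePositions(input);
        foreach (Util.Span span in spans)
        {
            // Each span is start-inclusive / end-exclusive into the original string.
            Console.WriteLine(input.Substring(span.Start, span.End - span.Start));
        }
    }
}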
/*
/// <summary>
/// Adds a mention for the non-treebank-labeled possessive noun phrases.
/// </summary>
/// <param name="possessiveNounPhrase">
/// The possessive noun phrase which may require an additional mention.
/// </param>
/// <param name="mentions">
/// The list of mentions into which a new mention can be added.
/// </param>
private void AddPossessiveMentions(IParse possessiveNounPhrase, List<Mention> mentions)
{
    List<IParse> kids = possessiveNounPhrase.SyntacticChildren;
    if (kids.Count > 1)
    {
        IParse firstToken = kids[1];
        if (firstToken.IsToken && firstToken.SyntacticType != "POS")
        {
            IParse lastToken = kids[kids.Count - 1];
            if (lastToken.IsToken)
            {
                var extentSpan = new Util.Span(firstToken.Span.Start, lastToken.Span.End);
                var extent = new Mention(extentSpan, extentSpan, -1, null, null);
                mentions.Add(extent);
            }
            else
            {
                Console.Error.WriteLine("AbstractMentionFinder.AddPossessiveMentions: odd parse structure: " + possessiveNounPhrase);
            }
        }
    }
}
*/

private void CollectPrenominalNamedEntities(IParse nounPhrase, List<Mention> extents)
{
    IParse headToken = mHeadFinder.GetHeadToken(nounPhrase);
    List<IParse> namedEntities = nounPhrase.NamedEntities;
    Util.Span headTokenSpan = headToken.Span;
    for (int namedEntityIndex = 0; namedEntityIndex < namedEntities.Count; namedEntityIndex++)
    {
        IParse namedEntity = namedEntities[namedEntityIndex];
        if (!namedEntity.Span.Contains(headTokenSpan))
        {
            var extent = new Mention(namedEntity.Span, namedEntity.Span, namedEntity.EntityId, null, "NAME");
            extent.NameType = namedEntity.EntityType;
            extents.Add(extent);
        }
    }
}
private void CollectCoordinatedNounPhraseMentions(IParse nounPhrase, List<Mention> entities)
{
    List<IParse> nounPhraseTokens = nounPhrase.Tokens;
    bool inCoordinatedNounPhrase = false;
    int lastNounPhraseTokenIndex = mHeadFinder.GetHeadIndex(nounPhrase);
    for (int tokenIndex = lastNounPhraseTokenIndex - 1; tokenIndex >= 0; tokenIndex--)
    {
        IParse token = nounPhraseTokens[tokenIndex];
        string tokenText = token.ToString();
        if (tokenText == "and" || tokenText == "or")
        {
            if (lastNounPhraseTokenIndex != tokenIndex)
            {
                if (tokenIndex - 1 >= 0 && PartsOfSpeech.IsNoun(nounPhraseTokens[tokenIndex - 1].SyntacticType))
                {
                    var nounPhraseSpan = new Util.Span(nounPhraseTokens[tokenIndex + 1].Span.Start, nounPhraseTokens[lastNounPhraseTokenIndex].Span.End);
                    var nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
                    entities.Add(nounPhraseSpanExtent);
                    inCoordinatedNounPhrase = true;
                }
                else
                {
                    break;
                }
            }
            lastNounPhraseTokenIndex = tokenIndex - 1;
        }
        else if (inCoordinatedNounPhrase && tokenText == PartsOfSpeech.Comma)
        {
            if (lastNounPhraseTokenIndex != tokenIndex)
            {
                var nounPhraseSpan = new Util.Span(nounPhraseTokens[tokenIndex + 1].Span.Start, nounPhraseTokens[lastNounPhraseTokenIndex].Span.End);
                var nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
                entities.Add(nounPhraseSpanExtent);
            }
            lastNounPhraseTokenIndex = tokenIndex - 1;
        }
        else if (inCoordinatedNounPhrase && tokenIndex == 0 && lastNounPhraseTokenIndex >= 0)
        {
            var nounPhraseSpan = new Util.Span(nounPhraseTokens[tokenIndex].Span.Start, nounPhraseTokens[lastNounPhraseTokenIndex].Span.End);
            var nounPhraseSpanExtent = new Mention(nounPhraseSpan, nounPhraseSpan, token.EntityId, null, "CNP");
            entities.Add(nounPhraseSpanExtent);
        }
    }
}
public MentionContext(Util.Span span, Util.Span headSpan, int entityId, IParse parse, string extentType, string nameType,
    int mentionIndex, int mentionsInSentence, int mentionIndexInDocument, int sentenceIndex, IHeadFinder headFinder) :
    base(span, headSpan, entityId, parse, extentType, nameType, headFinder)
{
    mNounLocation = mentionIndex;
    mMaxNounLocation = mentionsInSentence;
    mNounNumber = mentionIndexInDocument;
    mSentenceNumber = sentenceIndex;
    mIndexSpan = parse.Span;
    mPreviousToken = parse.PreviousToken;
    mNextToken = parse.NextToken;
    mHead = headFinder.GetLastHead(parse);
    List<IParse> headTokens = mHead.Tokens;
    Tokens = headTokens.ToArray();
    mBasalNextToken = mHead.NextToken;
    //System.err.println("MentionContext.init: "+ent+" "+ent.getEntityId()+" head="+head);
    //mNonDescriptorStart = 0;
    InitializeHeads(headFinder.GetHeadIndex(mHead));
    mGender = Similarity.GenderEnum.Unknown;
    mGenderProbability = 0d;
    mNumber = Similarity.NumberEnum.Unknown;
    mNumberProbability = 0d;
}
/// <summary>
/// Inserts the specified constituent into this parse based on its text span. This
/// method assumes that the specified constituent can be inserted into this parse.
/// </summary>
/// <param name="constituent">
/// The constituent to be inserted.
/// </param>
public virtual void Insert(Parse constituent)
{
    Util.Span constituentSpan = constituent.mSpan;
    if (mSpan.Contains(constituentSpan))
    {
        int currentPart;
        int partCount = mParts.Count;
        for (currentPart = 0; currentPart < partCount; currentPart++)
        {
            Parse subPart = mParts[currentPart];
            Util.Span subPartSpan = subPart.mSpan;
            if (subPartSpan.Start > constituentSpan.End)
            {
                break;
            }
            // constituent contains subPart
            else if (constituentSpan.Contains(subPartSpan))
            {
                mParts.RemoveAt(currentPart);
                currentPart--;
                constituent.mParts.Add(subPart);
                subPart.Parent = constituent;
                partCount = mParts.Count;
            }
            else if (subPartSpan.Contains(constituentSpan))
            {
                //System.Console.WriteLine("Parse.insert:subPart contains con");
                subPart.Insert(constituent);
                return;
            }
        }
        mParts.Insert(currentPart, constituent);
        constituent.Parent = this;
    }
    else
    {
        throw new ParseException("Inserting constituent not contained in the sentence!");
    }
}
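// Hedged usage sketch (not part of the library source): building a small constituent tree
// with Insert and printing it with Show. It relies only on the constructor and methods
// shown in these snippets; the spans, labels, and expected output are illustrative.
public static class ParseInsertSketch
{
    public static void Demo()
    {
        string text = "the dog barked";
        var root = new Parse(text, new Util.Span(0, text.Length), "TOP", 1.0);
        // Token leaves use the parser's token node type so Show() emits them without brackets.
        root.Insert(new Parse(text, new Util.Span(0, 3), MaximumEntropyParser.TokenNode, 1.0));
        root.Insert(new Parse(text, new Util.Span(4, 7), MaximumEntropyParser.TokenNode, 1.0));
        root.Insert(new Parse(text, new Util.Span(8, 14), MaximumEntropyParser.TokenNode, 1.0));
        // A noun-phrase constituent over the first two tokens; Insert re-parents them under it.
        root.Insert(new Parse(text, new Util.Span(0, 7), "NP", 1.0));
        Console.WriteLine(root.Show()); // roughly: (TOP (NP the dog) barked)
    }
}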
public MentionContext(Util.Span span, Util.Span headSpan, int entityId, IParse parse, string extentType, string nameType,
    int mentionIndex, int mentionsInSentence, int mentionIndexInDocument, int sentenceIndex, IHeadFinder headFinder) :
    base(span, headSpan, entityId, parse, extentType, nameType, headFinder)
{
    NounPhraseSentenceIndex = mentionIndex;
    MaxNounPhraseSentenceIndex = mentionsInSentence;
    NounPhraseDocumentIndex = mentionIndexInDocument;
    SentenceNumber = sentenceIndex;
    IndexSpan = parse.Span;
    PreviousToken = parse.PreviousToken;
    NextToken = parse.NextToken;
    Head = headFinder.GetLastHead(parse);
    List<IParse> headTokens = Head.Tokens;
    Tokens = headTokens.ToArray();
    NextTokenBasal = Head.NextToken;
    //mNonDescriptorStart = 0;
    InitializeHeads(headFinder.GetHeadIndex(Head));
    mGender = Similarity.GenderEnum.Unknown;
    GenderProbability = 0d;
    _number = Similarity.NumberEnum.Unknown;
    NumberProbability = 0d;
}
public virtual void AddEvents(Util.Span[] tokens, string input)
{
    if (tokens.Length > 0)
    {
        int startPosition = tokens[0].Start;
        int endPosition = tokens[tokens.Length - 1].End;
        string sentence = input.Substring(startPosition, endPosition - startPosition);
        Util.Span[] candidateTokens = MaximumEntropyTokenizer.Split(sentence);
        int firstTrainingToken = -1;
        int lastTrainingToken = -1;
        for (int currentCandidate = 0; currentCandidate < candidateTokens.Length; currentCandidate++)
        {
            Util.Span candidateSpan = candidateTokens[currentCandidate];
            string candidateToken = sentence.Substring(candidateSpan.Start, candidateSpan.End - candidateSpan.Start);
            // adjust candidateSpan to text offsets
            candidateSpan = new Util.Span(candidateSpan.Start + startPosition, candidateSpan.End + startPosition);
            // should we skip this token?
            if (candidateToken.Length > 1 && (!mSkipAlphanumerics || !MaximumEntropyTokenizer.AlphaNumeric.IsMatch(candidateToken)))
            {
                // find offsets of annotated tokens inside candidate tokens
                bool foundTrainingTokens = false;
                for (int currentToken = lastTrainingToken + 1; currentToken < tokens.Length; currentToken++)
                {
                    if (candidateSpan.Contains(tokens[currentToken]))
                    {
                        if (!foundTrainingTokens)
                        {
                            firstTrainingToken = currentToken;
                            foundTrainingTokens = true;
                        }
                        lastTrainingToken = currentToken;
                    }
                    else if (candidateSpan.End < tokens[currentToken].End)
                    {
                        break;
                    }
                    else if (tokens[currentToken].End < candidateSpan.Start)
                    {
                        // keep looking
                    }
                    else
                    {
                        throw new ApplicationException("Bad training token: " + tokens[currentToken] + " cand: " + candidateSpan);
                    }
                }
                // create training data
                if (foundTrainingTokens)
                {
                    for (int currentToken = firstTrainingToken; currentToken <= lastTrainingToken; currentToken++)
                    {
                        Util.Span trainingTokenSpan = tokens[currentToken];
                        int candidateStart = candidateSpan.Start;
                        for (int currentPosition = trainingTokenSpan.Start + 1; currentPosition < trainingTokenSpan.End; currentPosition++)
                        {
                            string[] context = mContextGenerator.GetContext(new Util.Pair<string, int>(candidateToken, currentPosition - candidateStart));
                            mEvents.Add(new SharpEntropy.TrainingEvent(TokenContextGenerator.NoSplitIndicator, context));
                        }
                        if (trainingTokenSpan.End != candidateSpan.End)
                        {
                            string[] context = mContextGenerator.GetContext(new Util.Pair<string, int>(candidateToken, trainingTokenSpan.End - candidateStart));
                            mEvents.Add(new SharpEntropy.TrainingEvent(TokenContextGenerator.SplitIndicator, context));
                        }
                    }
                }
            }
        }
    }
}
public Parse(string parseText, Util.Span span, string type, double probability, Parse head) :
    this(parseText, span, type, probability)
{
    mHead = head;
}
// Constructors --------------------

public Context(Util.Span span, Util.Span headSpan, int entityId, Mention.IParse parse, string extentType, string nameType, Mention.IHeadFinder headFinder) :
    base(span, headSpan, entityId, parse, extentType, nameType)
{
    Initialize(headFinder);
}
public virtual void AddEvents(Util.Span[] tokens, string input)
{
    if (tokens.Length > 0)
    {
        int startPosition = tokens[0].Start;
        int endPosition = tokens[tokens.Length - 1].End;
        string sentence = input.Substring(startPosition, endPosition - startPosition);
        Util.Span[] candidateTokens = MaximumEntropyTokenizer.SplitOnWhitespaces(sentence);
        int firstTrainingToken = -1;
        int lastTrainingToken = -1;
        for (int currentCandidate = 0; currentCandidate < candidateTokens.Length; currentCandidate++)
        {
            Util.Span candidateSpan = candidateTokens[currentCandidate];
            string candidateToken = sentence.Substring(candidateSpan.Start, candidateSpan.End - candidateSpan.Start);
            // adjust candidateSpan to text offsets
            candidateSpan = new Util.Span(candidateSpan.Start + startPosition, candidateSpan.End + startPosition);
            // should we skip this token?
            if (candidateToken.Length > 1 && (!mSkipAlphanumerics || !MaximumEntropyTokenizer.AlphaNumeric.IsMatch(candidateToken)))
            {
                // find offsets of annotated tokens inside candidate tokens
                bool foundTrainingTokens = false;
                for (int currentToken = lastTrainingToken + 1; currentToken < tokens.Length; currentToken++)
                {
                    if (candidateSpan.Contains(tokens[currentToken]))
                    {
                        if (!foundTrainingTokens)
                        {
                            firstTrainingToken = currentToken;
                            foundTrainingTokens = true;
                        }
                        lastTrainingToken = currentToken;
                    }
                    else if (candidateSpan.End < tokens[currentToken].End)
                    {
                        break;
                    }
                    else if (tokens[currentToken].End < candidateSpan.Start)
                    {
                        // keep looking
                    }
                    else
                    {
                        throw new ApplicationException("Bad training token: " + tokens[currentToken] + " cand: " + candidateSpan);
                    }
                }
                // create training data
                if (foundTrainingTokens)
                {
                    for (int currentToken = firstTrainingToken; currentToken <= lastTrainingToken; currentToken++)
                    {
                        Util.Span trainingTokenSpan = tokens[currentToken];
                        int candidateStart = candidateSpan.Start;
                        for (int currentPosition = trainingTokenSpan.Start + 1; currentPosition < trainingTokenSpan.End; currentPosition++)
                        {
                            string[] context = mContextGenerator.GetContext(new Tuple<string, int>(candidateToken, currentPosition - candidateStart));
                            mEvents.Add(new SharpEntropy.TrainingEvent(TokenContextGenerator.NoSplitIndicator, context));
                        }
                        if (trainingTokenSpan.End != candidateSpan.End)
                        {
                            string[] context = mContextGenerator.GetContext(new Tuple<string, int>(candidateToken, trainingTokenSpan.End - candidateStart));
                            mEvents.Add(new SharpEntropy.TrainingEvent(TokenContextGenerator.SplitIndicator, context));
                        }
                    }
                }
            }
        }
    }
}
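// Hedged usage sketch (not part of the library source): feeding AddEvents during training.
// The caller supplies the raw text plus hand-annotated token spans (offsets into that text);
// AddEvents then emits split/no-split training events for every candidate position. The
// "eventCollector" parameter stands in for whatever class hosts AddEvents; its concrete type
// and construction are assumptions here, so it is left as dynamic.
public static class AddEventsSketch
{
    public static void Demo(dynamic eventCollector)
    {
        string input = "He ran, then stopped.";
        // Gold-standard tokens: "He" "ran" "," "then" "stopped" "."
        Util.Span[] annotatedTokens =
        {
            new Util.Span(0, 2),
            new Util.Span(3, 6),
            new Util.Span(6, 7),
            new Util.Span(8, 12),
            new Util.Span(13, 20),
            new Util.Span(20, 21)
        };
        eventCollector.AddEvents(annotatedTokens, input);
    }
}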