///<summary>
///Inserts the specified constituent into this parse based on its text span. This
///method assumes that the specified constituent can be inserted into this parse.
///</summary>
///<param name="constituent">
///The constituent to be inserted.
///</param>
///<exception cref="ParseException">
///Thrown when the constituent's span is not contained within this parse's span.
///</exception>
public virtual void Insert(Parse constituent)
{
    Util.Span constituentSpan = constituent.mSpan;
    if (mSpan.Contains(constituentSpan))
    {
        int currentPart;
        int partCount = mParts.Count;
        // Scan the existing children to find where the constituent belongs,
        // re-parenting any children it subsumes along the way.
        // NOTE(review): the early break presumes children are ordered by span
        // start — confirm against how mParts is populated elsewhere.
        for (currentPart = 0; currentPart < partCount; currentPart++)
        {
            Parse subPart = mParts[currentPart];
            Util.Span subPartSpan = subPart.mSpan;
            if (subPartSpan.Start > constituentSpan.End)
            {
                // This child starts after the constituent ends; insert here.
                break;
            }
            // constituent Contains subPart
            else if (constituentSpan.Contains(subPartSpan))
            {
                // Move this child under the new constituent. Step the index back
                // and refresh the count because the list was modified in place.
                mParts.RemoveAt(currentPart);
                currentPart--;
                constituent.mParts.Add(subPart);
                subPart.Parent = constituent;
                partCount = mParts.Count;
            }
            else if (subPartSpan.Contains(constituentSpan))
            {
                // The constituent lies entirely inside an existing child;
                // recurse so it is inserted at the correct depth of the tree.
                //System.Console.WriteLine("Parse.insert:subPart contains con");
                subPart.Insert(constituent);
                return;
            }
        }
        // Insert at the position found above (or at the end if no later child
        // was found) and make this node the constituent's parent.
        mParts.Insert(currentPart, constituent);
        constituent.Parent = this;
    }
    else
    {
        throw new ParseException("Inserting constituent not contained in the sentence!");
    }
}
///<summary>
///Generates tokenizer training events for one annotated sentence and appends
///them to mEvents. The sentence (bounded by the first and last annotated token)
///is split on whitespace into candidate tokens; each character position of a
///candidate that falls inside an annotated token produces a no-split event, and
///each annotated-token boundary inside a candidate produces a split event.
///</summary>
///<param name="tokens">
///Spans of the annotated (gold) tokens, as offsets into <paramref name="input"/>.
///Assumed sorted by position and non-overlapping — TODO confirm at call sites.
///</param>
///<param name="input">
///The text the token spans refer to.
///</param>
///<exception cref="ApplicationException">
///Thrown when an annotated token partially overlaps a candidate token,
///indicating inconsistent training data.
///</exception>
public virtual void AddEvents(Util.Span[] tokens, string input)
{
    if (tokens.Length > 0)
    {
        int startPosition = tokens[0].Start;
        int endPosition = tokens[tokens.Length - 1].End;
        string sentence = input.Substring(startPosition, (endPosition) - (startPosition));
        // Candidate spans are relative to the sentence substring at this point.
        Util.Span[] candidateTokens = MaximumEntropyTokenizer.SplitOnWhitespaces(sentence);
        int firstTrainingToken = -1;
        int lastTrainingToken = -1;
        for (int currentCandidate = 0; currentCandidate < candidateTokens.Length; currentCandidate++)
        {
            Util.Span candidateSpan = candidateTokens[currentCandidate];
            string candidateToken = sentence.Substring(candidateSpan.Start, (candidateSpan.End) - (candidateSpan.Start));
            //adjust candidateSpan to text offsets
            candidateSpan = new Util.Span(candidateSpan.Start + startPosition, candidateSpan.End + startPosition);
            //should we skip this token
            // Single-character candidates cannot be split. Alphanumeric tokens
            // are skipped when mSkipAlphanumerics is set — presumably because
            // they never need splitting; verify against the tokenizer's config.
            if (candidateToken.Length > 1 && (!mSkipAlphanumerics || !MaximumEntropyTokenizer.AlphaNumeric.IsMatch(candidateToken)))
            {
                //find offsets of annotated tokens inside candidate tokens
                bool foundTrainingTokens = false;
                // lastTrainingToken carries over between candidates so already
                // consumed annotated tokens are never revisited.
                for (int currentToken = lastTrainingToken + 1; currentToken < tokens.Length; currentToken++)
                {
                    if (candidateSpan.Contains(tokens[currentToken]))
                    {
                        if (!foundTrainingTokens)
                        {
                            firstTrainingToken = currentToken;
                            foundTrainingTokens = true;
                        }
                        lastTrainingToken = currentToken;
                    }
                    else if (candidateSpan.End < tokens[currentToken].End)
                    {
                        // Annotated token extends past this candidate: stop.
                        break;
                    }
                    else if (tokens[currentToken].End < candidateSpan.Start)
                    {
                        //keep looking
                    }
                    else
                    {
                        // Partial overlap between annotation and candidate.
                        throw new ApplicationException("Bad training token: " + tokens[currentToken] + " cand: " + candidateSpan);
                    }
                }
                // create training data
                if (foundTrainingTokens)
                {
                    for (int currentToken = firstTrainingToken; currentToken <= lastTrainingToken; currentToken++)
                    {
                        Util.Span trainingTokenSpan = tokens[currentToken];
                        int candidateStart = candidateSpan.Start;
                        // Interior positions of an annotated token are
                        // "no split" examples.
                        for (int currentPosition = trainingTokenSpan.Start + 1; currentPosition < trainingTokenSpan.End; currentPosition++)
                        {
                            string[] context = mContextGenerator.GetContext(new Tuple<string, int>(candidateToken, currentPosition - candidateStart));
                            mEvents.Add(new SharpEntropy.TrainingEvent(TokenContextGenerator.NoSplitIndicator, context));
                        }
                        // The position just past an annotated token is a "split"
                        // example, unless it coincides with the candidate's end.
                        if (trainingTokenSpan.End != candidateSpan.End)
                        {
                            string[] context = mContextGenerator.GetContext(new Tuple<string, int>(candidateToken, trainingTokenSpan.End - candidateStart));
                            mEvents.Add(new SharpEntropy.TrainingEvent(TokenContextGenerator.SplitIndicator, context));
                        }
                    }
                }
            }
        }
    }
}
///<summary>
///Generates tokenizer training events for one annotated sentence and appends
///them to mEvents. The sentence (bounded by the first and last annotated token)
///is split into candidate tokens via MaximumEntropyTokenizer.Split; each
///character position of a candidate that falls inside an annotated token
///produces a no-split event, and each annotated-token boundary inside a
///candidate produces a split event.
///</summary>
///<param name="tokens">
///Spans of the annotated (gold) tokens, as offsets into <paramref name="input"/>.
///Assumed sorted by position and non-overlapping — TODO confirm at call sites.
///</param>
///<param name="input">
///The text the token spans refer to.
///</param>
///<exception cref="ApplicationException">
///Thrown when an annotated token partially overlaps a candidate token,
///indicating inconsistent training data.
///</exception>
public virtual void AddEvents(Util.Span[] tokens, string input)
{
    if (tokens.Length > 0)
    {
        int startPosition = tokens[0].Start;
        int endPosition = tokens[tokens.Length - 1].End;
        string sentence = input.Substring(startPosition, (endPosition) - (startPosition));
        // Candidate spans are relative to the sentence substring at this point.
        Util.Span[] candidateTokens = MaximumEntropyTokenizer.Split(sentence);
        int firstTrainingToken = -1;
        int lastTrainingToken = -1;
        for (int currentCandidate = 0; currentCandidate < candidateTokens.Length; currentCandidate++)
        {
            Util.Span candidateSpan = candidateTokens[currentCandidate];
            string candidateToken = sentence.Substring(candidateSpan.Start, (candidateSpan.End) - (candidateSpan.Start));
            //adjust candidateSpan to text offsets
            candidateSpan = new Util.Span(candidateSpan.Start + startPosition, candidateSpan.End + startPosition);
            //should we skip this token
            // Single-character candidates cannot be split. Alphanumeric tokens
            // are skipped when mSkipAlphanumerics is set — presumably because
            // they never need splitting; verify against the tokenizer's config.
            if (candidateToken.Length > 1 && (!mSkipAlphanumerics || !MaximumEntropyTokenizer.AlphaNumeric.IsMatch(candidateToken)))
            {
                //find offsets of annotated tokens inside candidate tokens
                bool foundTrainingTokens = false;
                // lastTrainingToken carries over between candidates so already
                // consumed annotated tokens are never revisited.
                for (int currentToken = lastTrainingToken + 1; currentToken < tokens.Length; currentToken++)
                {
                    if (candidateSpan.Contains(tokens[currentToken]))
                    {
                        if (!foundTrainingTokens)
                        {
                            firstTrainingToken = currentToken;
                            foundTrainingTokens = true;
                        }
                        lastTrainingToken = currentToken;
                    }
                    else if (candidateSpan.End < tokens[currentToken].End)
                    {
                        // Annotated token extends past this candidate: stop.
                        break;
                    }
                    else if (tokens[currentToken].End < candidateSpan.Start)
                    {
                        //keep looking
                    }
                    else
                    {
                        // Partial overlap between annotation and candidate.
                        throw new ApplicationException("Bad training token: " + tokens[currentToken] + " cand: " + candidateSpan);
                    }
                }
                // create training data
                if (foundTrainingTokens)
                {
                    for (int currentToken = firstTrainingToken; currentToken <= lastTrainingToken; currentToken++)
                    {
                        Util.Span trainingTokenSpan = tokens[currentToken];
                        int candidateStart = candidateSpan.Start;
                        // Interior positions of an annotated token are
                        // "no split" examples.
                        for (int currentPosition = trainingTokenSpan.Start + 1; currentPosition < trainingTokenSpan.End; currentPosition++)
                        {
                            string[] context = mContextGenerator.GetContext(new Util.Pair<string, int>(candidateToken, currentPosition - candidateStart));
                            mEvents.Add(new SharpEntropy.TrainingEvent(TokenContextGenerator.NoSplitIndicator, context));
                        }
                        // The position just past an annotated token is a "split"
                        // example, unless it coincides with the candidate's end.
                        if (trainingTokenSpan.End != candidateSpan.End)
                        {
                            string[] context = mContextGenerator.GetContext(new Util.Pair<string, int>(candidateToken, trainingTokenSpan.End - candidateStart));
                            mEvents.Add(new SharpEntropy.TrainingEvent(TokenContextGenerator.SplitIndicator, context));
                        }
                    }
                }
            }
        }
    }
}