/// <summary>
/// Appends one training event per token in <paramref name="chunks"/>, using the
/// token's <c>Type</c> as the outcome and the context produced by
/// <c>mPosContextGenerator</c>.
/// </summary>
/// <param name="events">
/// The list that receives the generated training events.
/// </param>
/// <param name="chunks">
/// The parses to read tokens from. A parse whose <c>IsPosTag</c> is true contributes
/// itself as a token; any other parse contributes each of its children.
/// </param>
private void AddTagEvents(List<SharpEntropy.TrainingEvent> events, Parse[] chunks)
{
    List<string> tokens = new List<string>();
    List<string> predicates = new List<string>();

    foreach (Parse chunkParse in chunks)
    {
        if (chunkParse.IsPosTag)
        {
            tokens.Add(chunkParse.ToString());
            predicates.Add(chunkParse.Type);
        }
        else
        {
            foreach (Parse tokenParse in chunkParse.GetChildren())
            {
                tokens.Add(tokenParse.ToString());
                predicates.Add(tokenParse.Type);
            }
        }
    }

    // The lists no longer change, so copy them to arrays once instead of once per
    // token (the per-iteration copies made event generation quadratic).
    // NOTE(review): assumes GetContext does not modify the arrays it receives.
    string[] tokenArray = tokens.ToArray();
    string[] predicateArray = predicates.ToArray();

    for (int currentToken = 0; currentToken < tokenArray.Length; currentToken++)
    {
        string[] context = mPosContextGenerator.GetContext(currentToken, tokenArray, predicateArray, null);
        events.Add(new SharpEntropy.TrainingEvent(predicateArray[currentToken], context));
    }
}
/// <summary>
/// Tokenizes and parses every input line and returns the combined textual output.
/// </summary>
/// <param name="lines">
/// The lines to parse. Each line is split with <c>mTokenizer</c> and every token is
/// passed through <c>ConvertToken</c> before parsing.
/// </param>
/// <param name="requestedParses">
/// The number of parses requested from <c>mParser.FullParse</c>. When greater than one,
/// each parse is prefixed with its zero-based index and its probability.
/// </param>
/// <returns>
/// For a line with tokens: for each returned parse, the space-joined converted tokens,
/// a space, the optional "index probability " prefix and the result of <c>Show()</c>.
/// For a line without tokens: "\r\n".
/// </returns>
public string DoParse(string[] lines, int requestedParses)
{
    System.Text.StringBuilder parseStringBuilder = new System.Text.StringBuilder();

    foreach (string line in lines)
    {
        string[] rawTokens = mTokenizer.Tokenize(line);
        string[] tokens = new string[rawTokens.Length];
        for (int tokenIndex = 0; tokenIndex < rawTokens.Length; tokenIndex++)
        {
            tokens[tokenIndex] = ConvertToken(rawTokens[tokenIndex]);
        }

        if (tokens.Length == 0)
        {
            parseStringBuilder.Append("\r\n");
            continue;
        }

        // The parse text is the converted tokens separated by single spaces.
        string text = string.Join(" ", tokens);
        Parse currentParse = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);

        int start = 0;
        foreach (string token in tokens)
        {
            currentParse.Insert(new Parse(text, new Util.Span(start, start + token.Length), MaximumEntropyParser.TokenNode, 0));
            start += token.Length + 1; // skip the token and the separating space
        }

        Parse[] parses = mParser.FullParse(currentParse, requestedParses);
        for (int currentParseIndex = 0; currentParseIndex < parses.Length; currentParseIndex++)
        {
            // Each parse is written exactly once. Previously a single builder was
            // reused without being reset, so every later parse re-emitted the token
            // text and all earlier parses. The token text prefix is kept so the
            // single-parse output is unchanged.
            parseStringBuilder.Append(text).Append(" ");

            if (requestedParses > 1)
            {
                // Label each alternative with its index; the input parse's text was
                // used here before, which repeated the sentence instead of numbering
                // the alternatives.
                parseStringBuilder
                    .Append(currentParseIndex.ToString(System.Globalization.CultureInfo.InvariantCulture))
                    .Append(" ")
                    .Append(parses[currentParseIndex].Probability.ToString(System.Globalization.CultureInfo.InvariantCulture))
                    .Append(" ");
            }

            parseStringBuilder.Append(parses[currentParseIndex].Show());
        }
    }

    return parseStringBuilder.ToString();
}
/// <summary>
/// Appends one training event per token in <paramref name="chunks"/>, using a
/// start/continue/other outcome and the context produced by
/// <c>mChunkContextGenerator</c>.
/// </summary>
/// <param name="events">
/// The list that receives the generated training events.
/// </param>
/// <param name="chunks">
/// The parses to read tokens from. A parse whose <c>IsPosTag</c> is true contributes
/// itself with the outcome <c>MaximumEntropyParser.OtherOutcome</c>. Any other parse
/// contributes each of its children: the first child gets
/// <c>MaximumEntropyParser.StartPrefix</c> + the parse's <c>Type</c>, the remaining
/// children get <c>MaximumEntropyParser.ContinuePrefix</c> + the parse's <c>Type</c>.
/// </param>
private void AddChunkEvents(List<SharpEntropy.TrainingEvent> events, Parse[] chunks)
{
    List<string> tokens = new List<string>();
    List<string> tags = new List<string>();
    List<string> predicates = new List<string>();

    foreach (Parse chunkParse in chunks)
    {
        if (chunkParse.IsPosTag)
        {
            tokens.Add(chunkParse.ToString());
            tags.Add(chunkParse.Type);
            predicates.Add(MaximumEntropyParser.OtherOutcome);
            continue;
        }

        string chunkType = chunkParse.Type;
        bool isStart = true;
        foreach (Parse tokenParse in chunkParse.GetChildren())
        {
            tokens.Add(tokenParse.ToString());
            tags.Add(tokenParse.Type);

            string prefix = isStart ? MaximumEntropyParser.StartPrefix : MaximumEntropyParser.ContinuePrefix;
            predicates.Add(prefix + chunkType);
            isStart = false;
        }
    }

    // The lists no longer change, so copy them to arrays once instead of three
    // copies per token (which made event generation quadratic).
    // NOTE(review): assumes GetContext does not modify the arrays it receives.
    string[] tokenArray = tokens.ToArray();
    string[] tagArray = tags.ToArray();
    string[] predicateArray = predicates.ToArray();

    for (int currentToken = 0; currentToken < tokenArray.Length; currentToken++)
    {
        string[] context = mChunkContextGenerator.GetContext(currentToken, tokenArray, tagArray, predicateArray);
        events.Add(new SharpEntropy.TrainingEvent(predicateArray[currentToken], context));
    }
}
/// <summary>
/// Returns the predictive context used to determine how the constituent at the specified index
/// should be combined with other constituents.
/// </summary>
/// <param name="constituents">
/// The constituents which have yet to be combined into new constituents.
/// </param>
/// <param name="index">
/// The index of the constituent which is being considered.
/// </param>
/// <returns>
/// The context for building constituents at the specified index: the "default" feature,
/// unigram, bigram and trigram features over the constituents at offsets -2..2 (each in
/// full and back-off form), followed by any punctuation features that apply.
/// </returns>
public virtual string[] GetContext(Parse[] constituents, int index)
{
    List<string> features = new List<string>(100);
    int constituentCount = constituents.Length;

    // default
    features.Add("default");

    // Neighbours that fall outside the array stay null; they are handed to
    // MakeConstituent / MakeConstituentBackOff as null.
    Parse parseM2 = null;
    Parse parseM1 = null;
    Parse parse0 = null;
    Parse parseP1 = null;
    Parse parseP2 = null;

    if (index - 2 >= 0)
    {
        parseM2 = constituents[index - 2];
    }
    if (index - 1 >= 0)
    {
        parseM1 = constituents[index - 1];
    }
    parse0 = constituents[index];
    if (index + 1 < constituentCount)
    {
        parseP1 = constituents[index + 1];
    }
    if (index + 2 < constituentCount)
    {
        parseP2 = constituents[index + 2];
    }

    // cons(-2), cons(-1), cons(0), cons(1), cons(2)
    string consM2 = MakeConstituent(parseM2, -2);
    string consM1 = MakeConstituent(parseM1, -1);
    string cons0 = MakeConstituent(parse0, 0);
    string consP1 = MakeConstituent(parseP1, 1);
    string consP2 = MakeConstituent(parseP2, 2);

    string backM2 = MakeConstituentBackOff(parseM2, -2);
    string backM1 = MakeConstituentBackOff(parseM1, -1);
    string back0 = MakeConstituentBackOff(parse0, 0);
    string backP1 = MakeConstituentBackOff(parseP1, 1);
    string backP2 = MakeConstituentBackOff(parseP2, 2);

    // cons(-2), cons(-1), cons(0), cons(1), cons(2)
    features.Add(consM2);
    features.Add(backM2);
    features.Add(consM1);
    features.Add(backM1);
    features.Add(cons0);
    features.Add(back0);
    features.Add(consP1);
    features.Add(backP1);
    features.Add(consP2);
    features.Add(backP2);

    // cons(-1,0), cons(0,1)
    features.Add(consM1 + "," + cons0);
    features.Add(backM1 + "," + cons0);
    features.Add(consM1 + "," + back0);
    features.Add(backM1 + "," + back0);

    features.Add(cons0 + "," + consP1);
    features.Add(back0 + "," + consP1);
    features.Add(cons0 + "," + backP1);
    features.Add(back0 + "," + backP1);

    // cons3(-2,-1,0), cons3(-1,0,1), cons3(0,1,2)
    features.Add(consM2 + "," + consM1 + "," + cons0);
    features.Add(backM2 + "," + consM1 + "," + cons0);
    features.Add(consM2 + "," + backM1 + "," + cons0);
    features.Add(backM2 + "," + backM1 + "," + cons0);
    features.Add(backM2 + "," + backM1 + "," + back0);

    features.Add(consM1 + "," + cons0 + "," + consP1);
    features.Add(backM1 + "," + cons0 + "," + consP1);
    features.Add(consM1 + "," + cons0 + "," + backP1);
    features.Add(backM1 + "," + cons0 + "," + backP1);
    features.Add(backM1 + "," + back0 + "," + backP1);

    features.Add(cons0 + "," + consP1 + "," + consP2);
    features.Add(cons0 + "," + backP1 + "," + consP2);
    features.Add(cons0 + "," + consP1 + "," + backP2);
    features.Add(cons0 + "," + backP1 + "," + backP2);
    features.Add(back0 + "," + backP1 + "," + backP2);

    // punct
    string currentParseWord = parse0.ToString();

    if (currentParseWord == "-RRB-" && HasPrecedingWord(constituents, index, "-LRB-"))
    {
        features.Add("bracketsmatch");
    }
    if (currentParseWord == "-RCB-" && HasPrecedingWord(constituents, index, "-LCB-"))
    {
        features.Add("bracketsmatch");
    }
    if (currentParseWord == PartsOfSpeech.RightCloseDoubleQuote
        && HasPrecedingWord(constituents, index, PartsOfSpeech.LeftOpenDoubleQuote))
    {
        features.Add("quotesmatch");
    }
    if (currentParseWord == "'" && HasPrecedingWord(constituents, index, "`"))
    {
        features.Add("quotesmatch");
    }
    if (currentParseWord == PartsOfSpeech.Comma
        && HasPrecedingWord(constituents, index, PartsOfSpeech.Comma))
    {
        features.Add("iscomma");
    }

    if (currentParseWord == PartsOfSpeech.SentenceFinalPunctuation && index == constituentCount - 1)
    {
        // Walk back to the nearest constituent whose label begins with the start
        // prefix; the feature fires only when that constituent is the very first one.
        for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
        {
            Parse testParse = constituents[parseIndex];
            if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix, System.StringComparison.Ordinal))
            {
                if (parseIndex == 0)
                {
                    features.Add("endofsentence");
                }
                break;
            }
        }
    }

    return features.ToArray();
}

/// <summary>
/// Scans backwards from the constituent before <paramref name="index"/> looking for one
/// whose text equals <paramref name="word"/>. The scan ends, unsuccessfully, after the
/// first constituent whose label begins with <c>MaximumEntropyParser.StartPrefix</c>.
/// </summary>
/// <param name="constituents">The constituents to scan.</param>
/// <param name="index">The index of the constituent the scan starts before.</param>
/// <param name="word">The text to look for.</param>
/// <returns>
/// True if a matching constituent was found before the scan ended; otherwise false.
/// </returns>
private static bool HasPrecedingWord(Parse[] constituents, int index, string word)
{
    for (int parseIndex = index - 1; parseIndex >= 0; parseIndex--)
    {
        Parse testParse = constituents[parseIndex];

        // The text is checked before the label, so a start-labelled constituent
        // that matches still counts.
        if (testParse.ToString() == word)
        {
            return true;
        }

        // Labels are identifiers, not linguistic text, so compare ordinally.
        if (testParse.Label.StartsWith(MaximumEntropyParser.StartPrefix, System.StringComparison.Ordinal))
        {
            return false;
        }
    }

    return false;
}
/// <summary>
/// Tokenizes and parses every input line and returns the combined textual output.
/// </summary>
/// <param name="lines">
/// The lines to parse. Each line is split with <c>mTokenizer</c> and every token is
/// passed through <c>ConvertToken</c> before parsing.
/// </param>
/// <param name="requestedParses">
/// The number of parses requested from <c>mParser.FullParse</c>. When greater than one,
/// each parse is prefixed with its zero-based index and its probability.
/// </param>
/// <returns>
/// For a line with tokens: for each returned parse, the space-joined converted tokens,
/// a space, the optional "index probability " prefix and the result of <c>Show()</c>.
/// For a line without tokens: "\r\n".
/// </returns>
public string DoParse(string[] lines, int requestedParses)
{
    System.Text.StringBuilder parseStringBuilder = new System.Text.StringBuilder();

    foreach (string line in lines)
    {
        string[] rawTokens = mTokenizer.Tokenize(line);
        string[] tokens = new string[rawTokens.Length];
        for (int tokenIndex = 0; tokenIndex < rawTokens.Length; tokenIndex++)
        {
            tokens[tokenIndex] = ConvertToken(rawTokens[tokenIndex]);
        }

        if (tokens.Length == 0)
        {
            parseStringBuilder.Append("\r\n");
            continue;
        }

        // The parse text is the converted tokens separated by single spaces.
        string text = string.Join(" ", tokens);
        Parse currentParse = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);

        int start = 0;
        foreach (string token in tokens)
        {
            currentParse.Insert(new Parse(text, new Util.Span(start, start + token.Length), MaximumEntropyParser.TokenNode, 0));
            start += token.Length + 1; // skip the token and the separating space
        }

        Parse[] parses = mParser.FullParse(currentParse, requestedParses);
        for (int currentParseIndex = 0; currentParseIndex < parses.Length; currentParseIndex++)
        {
            // Each parse is written exactly once. Previously a single builder was
            // reused without being reset, so every later parse re-emitted the token
            // text and all earlier parses. The token text prefix is kept so the
            // single-parse output is unchanged.
            parseStringBuilder.Append(text).Append(" ");

            if (requestedParses > 1)
            {
                // Label each alternative with its index; the input parse's text was
                // used here before, which repeated the sentence instead of numbering
                // the alternatives.
                parseStringBuilder
                    .Append(currentParseIndex.ToString(System.Globalization.CultureInfo.InvariantCulture))
                    .Append(" ")
                    .Append(parses[currentParseIndex].Probability.ToString(System.Globalization.CultureInfo.InvariantCulture))
                    .Append(" ");
            }

            parseStringBuilder.Append(parses[currentParseIndex].Show());
        }
    }

    return parseStringBuilder.ToString();
}