/// <summary>
/// Tokenizes a single line of text, converts each token and runs the parser over the result.
/// </summary>
/// <param name="line">The raw text handed to the tokenizer.</param>
/// <param name="requestedParses">Passed through unchanged to the parser's FullParse call.</param>
/// <returns>
/// The array returned by the parser's FullParse call, or <c>null</c> when the tokenizer
/// produced no tokens.
/// </returns>
public Parse[] DoParse(string line, int requestedParses)
{
    string[] rawTokens = mTokenizer.Tokenize(line);
    if (rawTokens.Length == 0)
    {
        // Nothing to parse: callers receive null rather than an empty array.
        return null;
    }

    string[] tokens = new string[rawTokens.Length];
    for (int tokenIndex = 0; tokenIndex < rawTokens.Length; tokenIndex++)
    {
        tokens[tokenIndex] = ConvertToken(rawTokens[tokenIndex]);
    }

    // The sentence text is the converted tokens separated by single spaces.
    string text = string.Join(" ", tokens);
    Parse currentParse = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);

    int start = 0;
    foreach (string token in tokens)
    {
        int end = start + token.Length;
        currentParse.Insert(new Parse(text, new Util.Span(start, end), MaximumEntropyParser.TokenNode, 0));

        // Skip the single separating space that follows every token.
        start = end + 1;
    }

    return mParser.FullParse(currentParse, requestedParses);
}
/// <summary>
/// Converts the supplied tokens and runs the parser over them.
/// </summary>
/// <param name="tokens">The raw tokens; each one is passed through ConvertToken.</param>
/// <param name="requestedParses">Passed through unchanged to the parser's FullParse call.</param>
/// <returns>
/// The array returned by the parser's FullParse call, or <c>null</c> when
/// <paramref name="tokens"/> yields no elements.
/// </returns>
private Parse[] DoParse(IEnumerable<string> tokens, int requestedParses)
{
    var converted = new List<string>();
    foreach (string raw in tokens)
    {
        converted.Add(ConvertToken(raw));
    }

    if (converted.Count == 0)
    {
        return null;
    }

    // Sentence text: converted tokens separated by single spaces.
    string text = string.Join(" ", converted.ToArray());
    var root = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);

    int offset = 0;
    foreach (string token in converted)
    {
        int end = offset + token.Length;
        root.Insert(new Parse(text, new Util.Span(offset, end), MaximumEntropyParser.TokenNode, 0));

        // Step over the separating space.
        offset = end + 1;
    }

    return _parser.FullParse(root, requestedParses);
}
/// <summary>
/// Builds a Parse structure from a tree-bank style (bracketed) parse string.
/// </summary>
/// <param name="parse">
/// The tree-bank style parse string to read.
/// </param>
/// <returns>
/// A Parse typed as the top node, covering the text assembled from the tokens found in the
/// string, with every recorded constituent inserted into it.
/// </returns>
/// <exception cref="ParseException">
/// Thrown when no type can be read after an opening parenthesis.
/// </exception>
public static Parse FromParseString(string parse)
{
    var sentence = new StringBuilder();
    var openNodes = new Stack<Tuple<string, int>>();
    var recorded = new List<Tuple<string, Util.Span>>();
    int position = 0;

    for (int index = 0; index < parse.Length; index++)
    {
        switch (parse[index])
        {
            case '(':
            {
                string remainder = parse.Substring(index + 1);
                string nodeType = GetType(remainder);
                if (nodeType == null)
                {
                    throw new ParseException("null type for: " + remainder);
                }

                string word = GetToken(remainder);

                // Remember where this node starts so its span can be closed at the matching ')'.
                openNodes.Push(new Tuple<string, int>(nodeType, position));

                if (word != null && nodeType != "-NONE-")
                {
                    var wordSpan = new Util.Span(position, position + word.Length);
                    recorded.Add(new Tuple<string, Util.Span>(MaximumEntropyParser.TokenNode, wordSpan));
                    sentence.Append(word).Append(" ");
                    position += word.Length + 1;
                }

                break;
            }

            case ')':
            {
                Tuple<string, int> opened = openNodes.Pop();
                if (opened.Item1 != "-NONE-")
                {
                    // position - 1 leaves out the space appended after the last token.
                    var nodeSpan = new Util.Span(opened.Item2, position - 1);
                    recorded.Add(new Tuple<string, Util.Span>(opened.Item1, nodeSpan));
                }

                break;
            }
        }
    }

    string text = sentence.ToString();
    var root = new Parse(text, new Util.Span(0, text.Length), MaximumEntropyParser.TopNode, 1);

    foreach (Tuple<string, Util.Span> entry in recorded)
    {
        // The root already stands for the top node, so entries of that type are not inserted.
        if (entry.Item1 == MaximumEntropyParser.TopNode)
        {
            continue;
        }

        root.Insert(new Parse(text, entry.Item2, entry.Item1, 1));
    }

    return root;
}
/// <summary>
/// Tokenizes and parses every line and returns the formatted parses of all lines as one string.
/// </summary>
/// <param name="lines">The lines of text to parse.</param>
/// <param name="requestedParses">
/// Passed through unchanged to the parser's FullParse call. When greater than one, each parse
/// in the output is prefixed with its index and its probability.
/// </param>
/// <returns>
/// The concatenated Show() output of every parse of every line. A line for which the
/// tokenizer produces no tokens contributes "\r\n".
/// </returns>
public string DoParse(string[] lines, int requestedParses)
{
    System.Text.StringBuilder parseStringBuilder = new System.Text.StringBuilder();

    foreach (string line in lines)
    {
        string[] rawTokens = mTokenizer.Tokenize(line);
        if (rawTokens.Length == 0)
        {
            parseStringBuilder.Append("\r\n");
            continue;
        }

        string[] tokens = new string[rawTokens.Length];
        for (int tokenIndex = 0; tokenIndex < rawTokens.Length; tokenIndex++)
        {
            tokens[tokenIndex] = ConvertToken(rawTokens[tokenIndex]);
        }

        // The sentence text is the converted tokens separated by single spaces.
        string text = string.Join(" ", tokens);
        Parse currentParse = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);

        int start = 0;
        foreach (string token in tokens)
        {
            int end = start + token.Length;
            currentParse.Insert(new Parse(text, new Util.Span(start, end), MaximumEntropyParser.TokenNode, 0));

            // Skip the single separating space that follows every token.
            start = end + 1;
        }

        Parse[] parses = mParser.FullParse(currentParse, requestedParses);
        for (int currentParseIndex = 0; currentParseIndex < parses.Length; currentParseIndex++)
        {
            Parse parse = parses[currentParseIndex];

            // Write straight into the result. Reusing the buffer that held the sentence text
            // would emit the raw tokens in front of every parse and repeat earlier parses.
            if (requestedParses > 1)
            {
                parseStringBuilder
                    .Append(currentParseIndex.ToString(System.Globalization.CultureInfo.InvariantCulture))
                    .Append(" ")
                    .Append(parse.Probability.ToString(System.Globalization.CultureInfo.InvariantCulture))
                    .Append(" ");
            }

            parseStringBuilder.Append(parse.Show());
        }
    }

    return parseStringBuilder.ToString();
}
///<summary>
///Places the given constituent into this parse according to its text span. Existing parts
///covered by the constituent become its children; if an existing part covers the
///constituent, the insertion is delegated to that part.
///</summary>
///<param name="constituent">
///The constituent to be inserted.
///</param>
///<exception cref="ParseException">
///Thrown when the span of this parse does not contain the constituent's span.
///</exception>
public virtual void Insert(Parse constituent)
{
    Util.Span insertedSpan = constituent.Span;
    if (!Span.Contains(insertedSpan))
    {
        throw new ParseException("Inserting constituent not contained in the sentence!");
    }

    int index = 0;
    while (index < _parts.Count)
    {
        Parse child = _parts[index];
        Util.Span childSpan = child.Span;

        if (childSpan.Start > insertedSpan.End)
        {
            // This part begins after the constituent ends: insertion point found.
            break;
        }

        if (insertedSpan.Contains(childSpan))
        {
            // Re-parent the covered part; the next part slides into the same index.
            _parts.RemoveAt(index);
            constituent._parts.Add(child);
            child.Parent = constituent;
            continue;
        }

        if (childSpan.Contains(insertedSpan))
        {
            // The constituent belongs somewhere below this part.
            child.Insert(constituent);
            return;
        }

        index++;
    }

    _parts.Insert(index, constituent);
    constituent.Parent = this;
}
///<summary>
///Advances the specified parse by labelling its first unlabelled child and returns an array of
///the advanced parses. Build outcomes are tried in order of decreasing probability until the
///probability of the outcomes tried reaches the specified amount of probability mass, Q.
///</summary>
///<param name="inputParse">
///The parse to advance.
///</param>
///<param name="qParam">
///The amount of probability mass that should be accounted for by the advanced parses.
///</param>
///<param name="buildProbabilities">
///Buffer handed to the build model's Evaluate call and read back as the build outcome
///probabilities. Entries are overwritten with 0 as outcomes are consumed.
///</param>
///<param name="checkProbabilities">
///Buffer handed to the check model's Evaluate call and read back as the check outcome
///probabilities.
///</param>
///<returns>
///The parses produced by this step; the array may be empty.
///</returns>
private Parse[] AdvanceParses(Parse inputParse, double qParam, double[] buildProbabilities, double[] checkProbabilities)
{
    double qOpp = 1 - qParam;
    Parse lastStartNode = null;   // The closest previous node which has been labeled as a start node.
    int lastStartIndex = -1;      // The index of the closest previous node which has been labeled as a start node.
    string lastStartType = null;  // The type of the closest previous node which has been labeled as a start node.
    int advanceNodeIndex;         // The index of the node which will be labeled in this iteration of advancing the parse.
    Parse advanceNode = null;     // The node which will be labeled in this iteration of advancing the parse.
    Parse[] children = inputParse.GetChildren();
    int nodeCount = children.Length;

    // Determine which node needs to be labeled, tracking the most recent start label before it.
    for (advanceNodeIndex = 0; advanceNodeIndex < nodeCount; advanceNodeIndex++)
    {
        advanceNode = children[advanceNodeIndex];
        string label = advanceNode.Label;
        if (label == null)
        {
            break;
        }

        string labelStartType;
        if (startTypeMap.TryGetValue(label, out labelStartType))
        {
            lastStartType = labelStartType;
            lastStartNode = advanceNode;
            lastStartIndex = advanceNodeIndex;
        }
    }

    var newParsesList = new List<Parse>(buildModel.OutcomeCount);

    // Call build.
    buildModel.Evaluate(buildContextGenerator.GetContext(children, advanceNodeIndex), buildProbabilities);
    double buildProbabilitiesSum = 0;

    while (buildProbabilitiesSum < qParam)
    {
        // Find the largest build outcome that has not been tried yet.
        int highestBuildProbabilityIndex = 0;
        for (int probabilityIndex = 1; probabilityIndex < buildProbabilities.Length; probabilityIndex++)
        {
            if (buildProbabilities[probabilityIndex] > buildProbabilities[highestBuildProbabilityIndex])
            {
                highestBuildProbabilityIndex = probabilityIndex;
            }
        }

        if (buildProbabilities[highestBuildProbabilityIndex] == 0)
        {
            // Every remaining outcome has zero probability.
            break;
        }

        double highestBuildProbability = buildProbabilities[highestBuildProbabilityIndex];
        buildProbabilities[highestBuildProbabilityIndex] = 0; // zero out so the next maximum can be found
        buildProbabilitiesSum += highestBuildProbability;

        string tag = buildModel.GetOutcomeName(highestBuildProbabilityIndex);

        if (highestBuildProbabilityIndex == topStartIndex)
        {
            // Can't have top until complete.
            continue;
        }

        string tagStartType;
        string tagContinueType;
        if (startTypeMap.TryGetValue(tag, out tagStartType))
        {
            // Update last start.
            lastStartIndex = advanceNodeIndex;
            lastStartNode = advanceNode;
            lastStartType = tagStartType;
        }
        else if (continueTypeMap.TryGetValue(tag, out tagContinueType))
        {
            if (lastStartNode == null || lastStartType != tagContinueType)
            {
                // A continue label must match the previous start or continue.
                continue;
            }
        }

        var newParse1 = (Parse)inputParse.Clone();
        if (CreateDerivationString)
        {
            newParse1.AppendDerivationBuffer(highestBuildProbabilityIndex.ToString(System.Globalization.CultureInfo.InvariantCulture));
            newParse1.AppendDerivationBuffer("-");
        }

        newParse1.SetChild(advanceNodeIndex, tag); // replace constituent labeled
        newParse1.AddProbability(Math.Log(highestBuildProbability));

        // Call check.
        checkModel.Evaluate(checkContextGenerator.GetContext(newParse1.GetChildren(), lastStartType, lastStartIndex, advanceNodeIndex), checkProbabilities);

        if (checkProbabilities[completeIndex] > qOpp)
        {
            // A reduce is likely. Score it with the probability the guard tested, not a fixed slot.
            double completeProbability = checkProbabilities[completeIndex];

            Parse newParse2 = (Parse)newParse1.Clone();
            if (CreateDerivationString)
            {
                newParse2.AppendDerivationBuffer("1");
                newParse2.AppendDerivationBuffer(".");
            }

            newParse2.AddProbability(Math.Log(completeProbability));

            int lastConstituentIndex = advanceNodeIndex - lastStartIndex;
            var constituent = new Parse[lastConstituentIndex + 1];
            bool isFlat = true;

            // First node of the constituent.
            constituent[0] = lastStartNode;
            if (constituent[0].Type != constituent[0].Head.Type)
            {
                isFlat = false;
            }

            // Last node of the constituent.
            constituent[lastConstituentIndex] = advanceNode;
            if (isFlat && constituent[lastConstituentIndex].Type != constituent[lastConstituentIndex].Head.Type)
            {
                isFlat = false;
            }

            // Nodes in between.
            for (int constituentIndex = 1; constituentIndex < lastConstituentIndex; constituentIndex++)
            {
                constituent[constituentIndex] = children[constituentIndex + lastStartIndex];
                if (isFlat && constituent[constituentIndex].Type != constituent[constituentIndex].Head.Type)
                {
                    isFlat = false;
                }
            }

            if (!isFlat)
            {
                // Flat chunks are done by the chunker.
                newParse2.Insert(new Parse(inputParse.Text, new Util.Span(lastStartNode.Span.Start, advanceNode.Span.End), lastStartType, completeProbability, headRules.GetHead(constituent, lastStartType)));
                newParsesList.Add(newParse2);
            }
        }

        if (checkProbabilities[incompleteIndex] > qOpp)
        {
            // A shift is likely.
            if (CreateDerivationString)
            {
                newParse1.AppendDerivationBuffer("0");
                newParse1.AppendDerivationBuffer(".");
            }

            if (advanceNodeIndex != nodeCount - 1)
            {
                // Can't shift the last element.
                newParse1.AddProbability(Math.Log(checkProbabilities[incompleteIndex]));
                newParsesList.Add(newParse1);
            }
        }
    }

    return newParsesList.ToArray();
}
/// <summary>
/// Builds a Parse structure from a tree-bank style (bracketed) parse string.
/// </summary>
/// <param name="parse">
/// The tree-bank style parse string to read.
/// </param>
/// <returns>
/// A Parse typed as the top node, covering the text assembled from the tokens found in the
/// string, with every recorded constituent inserted into it.
/// </returns>
/// <exception cref="ParseException">
/// Thrown when no type can be read after an opening parenthesis.
/// </exception>
public static Parse FromParseString(string parse)
{
    StringBuilder sentence = new StringBuilder();
    Stack<Util.Pair<string, int>> openNodes = new Stack<Util.Pair<string, int>>();
    List<Util.Pair<string, Util.Span>> recorded = new List<Util.Pair<string, Util.Span>>();
    int position = 0;

    for (int index = 0; index < parse.Length; index++)
    {
        switch (parse[index])
        {
            case '(':
            {
                string remainder = parse.Substring(index + 1);
                string nodeType = GetType(remainder);
                if (nodeType == null)
                {
                    throw new ParseException("null type for: " + remainder);
                }

                string word = GetToken(remainder);

                // Remember where this node starts so its span can be closed at the matching ')'.
                openNodes.Push(new Util.Pair<string, int>(nodeType, position));

                if (word != null && nodeType != "-NONE-")
                {
                    Util.Span wordSpan = new Util.Span(position, position + word.Length);
                    recorded.Add(new Util.Pair<string, Util.Span>(MaximumEntropyParser.TokenNode, wordSpan));
                    sentence.Append(word).Append(" ");
                    position += word.Length + 1;
                }

                break;
            }

            case ')':
            {
                Util.Pair<string, int> opened = openNodes.Pop();
                if (opened.FirstValue != "-NONE-")
                {
                    // position - 1 leaves out the space appended after the last token.
                    Util.Span nodeSpan = new Util.Span(opened.SecondValue, position - 1);
                    recorded.Add(new Util.Pair<string, Util.Span>(opened.FirstValue, nodeSpan));
                }

                break;
            }
        }
    }

    string text = sentence.ToString();
    Parse root = new Parse(text, new Util.Span(0, text.Length), MaximumEntropyParser.TopNode, 1);

    foreach (Util.Pair<string, Util.Span> entry in recorded)
    {
        // The root already stands for the top node, so entries of that type are not inserted.
        if (entry.FirstValue == MaximumEntropyParser.TopNode)
        {
            continue;
        }

        root.Insert(new Parse(text, entry.SecondValue, entry.FirstValue, 1));
    }

    return root;
}
/// <summary>
/// Tokenizes a single line of text, converts each token and runs the parser over the result.
/// </summary>
/// <param name="line">The raw text handed to the tokenizer.</param>
/// <param name="requestedParses">Passed through unchanged to the parser's FullParse call.</param>
/// <returns>
/// The array returned by the parser's FullParse call, or <c>null</c> when the tokenizer
/// produced no tokens.
/// </returns>
public Parse[] DoParse(string line, int requestedParses)
{
    string[] rawTokens = mTokenizer.Tokenize(line);
    if (rawTokens.Length == 0)
    {
        // Nothing to parse: callers receive null rather than an empty array.
        return null;
    }

    string[] tokens = new string[rawTokens.Length];
    for (int tokenIndex = 0; tokenIndex < rawTokens.Length; tokenIndex++)
    {
        tokens[tokenIndex] = ConvertToken(rawTokens[tokenIndex]);
    }

    // The sentence text is the converted tokens separated by single spaces.
    string text = string.Join(" ", tokens);
    Parse currentParse = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);

    int start = 0;
    foreach (string token in tokens)
    {
        int end = start + token.Length;
        currentParse.Insert(new Parse(text, new Util.Span(start, end), MaximumEntropyParser.TokenNode, 0));

        // Skip the single separating space that follows every token.
        start = end + 1;
    }

    return mParser.FullParse(currentParse, requestedParses);
}
/// <summary>
/// Tokenizes and parses every line and returns the formatted parses of all lines as one string.
/// </summary>
/// <param name="lines">The lines of text to parse.</param>
/// <param name="requestedParses">
/// Passed through unchanged to the parser's FullParse call. When greater than one, each parse
/// in the output is prefixed with its index and its probability.
/// </param>
/// <returns>
/// The concatenated Show() output of every parse of every line. A line for which the
/// tokenizer produces no tokens contributes "\r\n".
/// </returns>
public string DoParse(string[] lines, int requestedParses)
{
    System.Text.StringBuilder parseStringBuilder = new System.Text.StringBuilder();

    foreach (string line in lines)
    {
        string[] rawTokens = mTokenizer.Tokenize(line);
        if (rawTokens.Length == 0)
        {
            parseStringBuilder.Append("\r\n");
            continue;
        }

        string[] tokens = new string[rawTokens.Length];
        for (int tokenIndex = 0; tokenIndex < rawTokens.Length; tokenIndex++)
        {
            tokens[tokenIndex] = ConvertToken(rawTokens[tokenIndex]);
        }

        // The sentence text is the converted tokens separated by single spaces.
        string text = string.Join(" ", tokens);
        Parse currentParse = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);

        int start = 0;
        foreach (string token in tokens)
        {
            int end = start + token.Length;
            currentParse.Insert(new Parse(text, new Util.Span(start, end), MaximumEntropyParser.TokenNode, 0));

            // Skip the single separating space that follows every token.
            start = end + 1;
        }

        Parse[] parses = mParser.FullParse(currentParse, requestedParses);
        for (int currentParseIndex = 0; currentParseIndex < parses.Length; currentParseIndex++)
        {
            Parse parse = parses[currentParseIndex];

            // Write straight into the result. Reusing the buffer that held the sentence text
            // would emit the raw tokens in front of every parse and repeat earlier parses.
            if (requestedParses > 1)
            {
                parseStringBuilder
                    .Append(currentParseIndex.ToString(System.Globalization.CultureInfo.InvariantCulture))
                    .Append(" ")
                    .Append(parse.Probability.ToString(System.Globalization.CultureInfo.InvariantCulture))
                    .Append(" ");
            }

            parseStringBuilder.Append(parse.Show());
        }
    }

    return parseStringBuilder.ToString();
}
/// <summary>
/// Converts the supplied tokens and runs the parser over them.
/// </summary>
/// <param name="tokens">The raw tokens; each one is passed through ConvertToken.</param>
/// <param name="requestedParses">Passed through unchanged to the parser's FullParse call.</param>
/// <returns>
/// The array returned by the parser's FullParse call, or <c>null</c> when
/// <paramref name="tokens"/> yields no elements.
/// </returns>
private Parse[] DoParse(IEnumerable<string> tokens, int requestedParses)
{
    var converted = new List<string>();
    foreach (string raw in tokens)
    {
        converted.Add(ConvertToken(raw));
    }

    if (converted.Count == 0)
    {
        return null;
    }

    // Sentence text: converted tokens separated by single spaces.
    string text = string.Join(" ", converted.ToArray());
    var root = new Parse(text, new Util.Span(0, text.Length), "INC", 1, null);

    int offset = 0;
    foreach (string token in converted)
    {
        int end = offset + token.Length;
        root.Insert(new Parse(text, new Util.Span(offset, end), MaximumEntropyParser.TokenNode, 0));

        // Step over the separating space.
        offset = end + 1;
    }

    return _parser.FullParse(root, requestedParses);
}