/// <summary>
/// Recursively processes a non-root node: every leaf child is handed to <c>ProcessLeaf</c>
/// with the chunk tag that applies to it, and every node child is processed with this
/// node's chunk tag as its inherited tag.
/// </summary>
/// <param name="node">The node whose children are processed.</param>
/// <param name="sentence">List forwarded unchanged to <c>ProcessLeaf</c> and to the recursive calls.</param>
/// <param name="tags">List forwarded unchanged to <c>IsIntermediate</c>, <c>ProcessLeaf</c> and the recursive calls.</param>
/// <param name="target">
/// The chunk labels produced so far. Its count is passed to <c>GetChunkTag</c>, and the entries
/// appended by a child node are inspected to decide whether the current chunk continues.
/// </param>
/// <param name="inheritedTag">
/// The chunk tag of the parent node, or <c>null</c> when there is none (the root passes <c>null</c>).
/// </param>
private void ProcessNode(AdNode node, List<string> sentence, List<string> tags, List<string> target, String inheritedTag)
{
    var chunkTag = GetChunkTag(node, inheritedTag, target.Count);

    // A node without a chunk type of its own continues the chunk of its parent, if any.
    var usesParentTag = chunkTag == Other && inheritedTag != null;
    if (usesParentTag)
    {
        chunkTag = inheritedTag;
    }

    for (var pos = 0; pos < node.Elements.Count; pos++)
    {
        if (!node.Elements[pos].IsLeaf)
        {
            var firstNew = target.Count;
            ProcessNode((AdNode)node.Elements[pos], sentence, tags, target, chunkTag);

            // If the child produced any label of a different chunk type, the chunk sequence is
            // broken: the remaining children of this node start from Other.
            for (var k = target.Count - 1; k >= firstNew; k--)
            {
                if (!target[k].EndsWith("-" + chunkTag))
                {
                    chunkTag = Other;
                    break;
                }
            }

            continue;
        }

        var leaf = (AdLeaf)node.Elements[pos];
        var leafTag = chunkTag;

        // A chunk tag reported for the leaf itself overrides the tag of the enclosing node.
        var ownChunk = GetChunkTag(leaf);
        if (ownChunk != null && !leafTag.Equals(ownChunk))
        {
            leafTag = ownChunk;
        }

        // The first leaf of a node can only continue a chunk when the tag was inherited.
        var continuesChunk = IsIntermediate(tags, target, leafTag) && (usesParentTag || pos > 0);

        // When punctuation is excluded, a leaf without a functional tag is tagged Other unless
        // both of its neighbours inside this node are leaves.
        if (!IncludePunctuations && leaf.FunctionalTag == null &&
            !(pos + 1 < node.Elements.Count && node.Elements[pos + 1].IsLeaf &&
              pos > 0 && node.Elements[pos - 1].IsLeaf))
        {
            continuesChunk = false;
            leafTag = Other;
        }

        ProcessLeaf(leaf, continuesChunk, leafTag, sentence, tags, target);
    }
}
/// <summary>
/// Walks a node in Arvores Deitadas format recursively, handing every leaf it finds to
/// <c>ProcessLeaf</c> together with the sentence token list.
/// </summary>
/// <param name="node">The node to walk. Nothing happens when it is <c>null</c>.</param>
/// <param name="sentence">The sentence tokens collected so far.</param>
private void Process(AdNode node, List<string> sentence)
{
    if (node != null)
    {
        foreach (var child in node.Elements)
        {
            if (!child.IsLeaf)
            {
                // Inner node: descend before moving on to the next sibling.
                Process((AdNode)child, sentence);
                continue;
            }

            ProcessLeaf((AdLeaf)child, sentence);
        }
    }
}
/// <summary>
/// Processes the children of the root node. Leaves attached directly to the root are passed
/// to <c>ProcessLeaf</c> as non-intermediate elements tagged <c>Other</c>; inner nodes are
/// passed to <c>ProcessNode</c> with no inherited tag.
/// </summary>
/// <param name="root">The root node. Nothing happens when it is <c>null</c>.</param>
/// <param name="sentence">List forwarded unchanged to <c>ProcessLeaf</c> and <c>ProcessNode</c>.</param>
/// <param name="tags">List forwarded unchanged to <c>ProcessLeaf</c> and <c>ProcessNode</c>.</param>
/// <param name="target">List forwarded unchanged to <c>ProcessLeaf</c> and <c>ProcessNode</c>.</param>
protected void ProcessRoot(AdNode root, List<string> sentence, List<string> tags, List<string> target)
{
    if (root != null)
    {
        foreach (var child in root.Elements)
        {
            if (!child.IsLeaf)
            {
                // Top-level phrases start without an inherited chunk tag.
                ProcessNode((AdNode)child, sentence, tags, target, null);
                continue;
            }

            ProcessLeaf((AdLeaf)child, false, Other, sentence, tags, target);
        }
    }
}
/// <summary>
/// Derives the chunk tag of a node from its syntactic tag.
/// </summary>
/// <param name="node">The node whose <c>SyntacticTag</c> is inspected.</param>
/// <param name="parent">
/// The chunk tag of the parent. It is only consulted for <c>adjp</c> phrases, which are
/// treated as <c>np</c> unless the parent is <c>"NP"</c>.
/// </param>
/// <param name="index">Not used by this implementation.</param>
/// <returns>
/// <c>NP</c>, <c>VP</c>, <c>PP</c>, <c>AP</c> or <c>ADVP</c> when the phrase label is one of
/// those (in lower case); otherwise <c>Other</c>.
/// </returns>
protected virtual string GetChunkTag(AdNode node, string parent, int index)
{
    var syntactic = node.SyntacticTag;

    // The phrase label is whatever follows the last ':' (the whole tag when there is none),
    // with any trailing '-' characters removed.
    var label = syntactic.Substring(syntactic.LastIndexOf(":", StringComparison.Ordinal) + 1);
    while (label.EndsWith("-"))
    {
        label = label.Substring(0, label.Length - 1);
    }

    if (label == "adjp" && parent != "NP")
    {
        label = "np";
    }

    // maybe we should use only np, vp and pp, but will keep ap and advp.
    // Anything else (including adjp under an NP parent, cu and sq) maps to Other.
    switch (label)
    {
        case "np":
        case "vp":
        case "pp":
        case "ap":
        case "advp":
            return label.ToUpperInvariant();

        default:
            return Other;
    }
}
/// <summary>
/// Parses the specified sentence string.
/// Converts the string representation of a sentence in Arvores Deitadas format to its
/// <see cref="AdSentence" /> equivalent. A return value indicates whether the
/// conversion succeeded or failed.
/// </summary>
/// <param name="sentence">
/// When this method returns, contains the parsed sentence, or <c>null</c> if the conversion failed.
/// </param>
/// <param name="sentenceString">The sentence string.</param>
/// <param name="para">The paragraph number; appended to the metadata as <c>p=&lt;para&gt;</c>.</param>
/// <param name="isTitle">if set to <c>true</c>, <c>" title"</c> is appended to the metadata.</param>
/// <param name="isBox">if set to <c>true</c>, <c>" box"</c> is appended to the metadata.</param>
/// <param name="safeParse">
/// if set to <c>true</c>, the whole sentence is rejected as soon as one tree line cannot be parsed;
/// otherwise such lines are skipped.
/// </param>
/// <param name="monitor">The evaluation monitor. This value can be a <c>null</c> value.</param>
/// <returns><c>true</c> if the <paramref name="sentenceString"/> parameter was converted successfully, <c>false</c> otherwise.</returns>
/// <remarks>
/// Exceptions raised while reading the text or building the tree (including the
/// <see cref="System.InvalidOperationException"/> raised for a node that has no valid parent) are
/// caught inside this method, wrapped in a <see cref="System.IO.InvalidDataException"/> and passed to
/// <paramref name="monitor"/> when one is supplied; the method then returns <c>false</c>.
/// When the input reaches an <c>&amp;&amp;</c> separator before any <c>SOURCE</c> line, no text or
/// metadata is read and the resulting sentence has <c>null</c> <c>Text</c> and <c>Metadata</c>.
/// All format markers are matched with ordinal comparisons, because they are fixed tokens of the
/// corpus format rather than linguistic text.
/// </remarks>
public static bool TryParse(
    out AdSentence sentence,
    string sentenceString,
    int para,
    bool isTitle,
    bool isBox,
    bool safeParse,
    Monitor monitor)
{
    string text = null;
    string meta = null;
    var sent = new AdSentence();

    try
    {
        using (var reader = new StringReader(sentenceString))
        {
            // first line is <s ...>
            var line = reader.ReadLine();
            if (line == null)
            {
                sentence = null;
                return false;
            }

            var useSameTextAndMeta = false; // to handle cases where there are diff sug of parse (&&)

            // Advance to the SOURCE line, or stop at an alternative-parse separator.
            while (!line.StartsWith("SOURCE", StringComparison.Ordinal))
            {
                if (line.Equals("&&"))
                {
                    useSameTextAndMeta = true;
                    break;
                }

                line = reader.ReadLine();
                if (line == null)
                {
                    sentence = null;
                    return false;
                }
            }

            if (!useSameTextAndMeta)
            {
                // Everything after the first seven characters of the SOURCE line is kept as metadata.
                var metaFromSource = line.Substring(7);

                line = reader.ReadLine();
                if (line == null)
                {
                    sentence = null;
                    return false;
                }

                // The line after SOURCE is "<id> <sentence text>".
                var start = line.IndexOf(" ", StringComparison.Ordinal);
                text = FixPunctuation(line.Substring(start + 1).Trim());

                if (start > 0)
                {
                    meta = line.Substring(0, start) + " p=" + para;
                    if (isTitle)
                    {
                        meta += " title";
                    }

                    if (isBox)
                    {
                        meta += " box";
                    }

                    meta += metaFromSource;
                }
                else
                {
                    // rare case where there is no space between id and the sentence.
                    if (monitor != null)
                    {
                        monitor.OnWarning("A sentence was skipped due a possible integrity loss.");
                    }

                    // The OpenNLP uses previous meta, but it is better to just ignore the sentence
                    // since the previous meta is not related to the current one.
                    sentence = null;
                    return false;
                }
            }

            sent.Text = text;
            sent.Metadata = meta;

            // skip lines starting with ###
            line = reader.ReadLine();
            while (line != null && line.StartsWith("###", StringComparison.Ordinal))
            {
                line = reader.ReadLine();
            }

            var nodeStack = new List<AdNode>();
            sent.Root = new AdNode { SyntacticTag = "ROOT", Level = 0 };
            nodeStack.Add(sent.Root);

            while (!string.IsNullOrEmpty(line) &&
                   !line.StartsWith("</s>", StringComparison.Ordinal) &&
                   !line.Equals("&&"))
            {
                AdTreeElement element;
                if (TryParseElement(out element, line, safeParse, monitor))
                {
                    // The idea here is to keep a stack of nodes that are candidates for
                    // parenting the following elements (nodes and leafs).

                    // 1) When we get a new element, we check its level and remove from
                    //    the top of the stack nodes that are brothers or nephews.
                    while (nodeStack.Count != 0 &&
                           element.Level > 0 &&
                           element.Level <= nodeStack[nodeStack.Count - 1].Level)
                    {
                        nodeStack.RemoveAt(nodeStack.Count - 1); // pop
                    }

                    if (element.IsLeaf)
                    {
                        // 2) Look for the node with the correct level to parent this leaf.
                        if (element.Level == 0)
                        {
                            nodeStack[0].Elements.Add(element);
                        }
                        else
                        {
                            var peek = nodeStack[nodeStack.Count - 1];
                            var index = nodeStack.Count - 1;
                            AdNode parent = null;

                            // Walk down the stack until a shallower node is found; fall back
                            // to the root when the stack is exhausted.
                            while (parent == null)
                            {
                                if (peek.Level < element.Level)
                                {
                                    parent = peek;
                                    break;
                                }

                                index--;
                                if (index > -1)
                                {
                                    peek = nodeStack[index];
                                }
                                else
                                {
                                    parent = nodeStack[0];
                                }
                            }

                            parent.AddElement(element);
                        }
                    }
                    else
                    {
                        // 3) Check if the element that is at the top of the stack is this
                        //    node's parent; if yes add it as a son.
                        if (nodeStack.Count != 0 && nodeStack[nodeStack.Count - 1].Level < element.Level)
                        {
                            nodeStack[nodeStack.Count - 1].AddElement(element);
                        }
                        else
                        {
                            // Caught by the handler below and reported through the monitor.
                            throw new InvalidOperationException("Should not happen!");
                        }

                        nodeStack.Add((AdNode)element);
                    }
                }
                else if (safeParse)
                {
                    // invalid element, so we skip this sentence...
                    sentence = null;
                    return false;
                }

                line = reader.ReadLine();
            }
        }
    }
    catch (Exception ex)
    {
        if (monitor != null)
        {
            monitor.OnException(new InvalidDataException("Something went wrong during the AdSentence parse.", ex));
        }

        sentence = null;
        return false;
    }

    sentence = sent;
    return true;
}
/// <summary>
/// Converts the specified string representation of a tree element to its <see cref="AdTreeElement"/>
/// equivalent and returns a value that indicates whether the conversion succeeded.
/// </summary>
/// <param name="element">
/// When this method returns, contains the <see cref="AdTreeElement"/> value equivalent to the element
/// contained in <paramref name="line"/>, if the conversion succeeded, or <c>null</c> if the conversion
/// failed. The conversion fails if the <paramref name="line"/> parameter is null, is an empty string (""),
/// or does not contain a valid string representation of a AdElement. This parameter is passed
/// uninitialized.
/// </param>
/// <param name="line">The string representation of the element.</param>
/// <param name="safeParse">
/// if set to <c>true</c>, a line that matches none of the node, leaf or punctuation patterns is
/// rejected immediately, without the fallback handling for malformed lines and without a warning.
/// </param>
/// <param name="monitor">The evaluation monitor. This value can be a <c>null</c> value.</param>
/// <returns><c>true</c> if the <paramref name="line"/> parameter was converted successfully; otherwise, <c>false</c>.</returns>
private static bool TryParseElement(out AdTreeElement element, string line, bool safeParse, Monitor monitor)
{
    // Honour the documented contract: a null line is a failed conversion, not an exception
    // raised by the pattern match below.
    if (line == null)
    {
        element = null;
        return false;
    }

    var m = nodePattern.Match(line);
    if (m.Success)
    {
        element = new AdNode
        {
            Level = m.Groups[1].Length + 1,
            SyntacticTag = m.Groups[2].Value
        };
        return true;
    }

    m = leafPattern.Match(line);
    if (m.Success)
    {
        element = new AdLeaf
        {
            Level = m.Groups[1].Length + 1,
            SyntacticTag = m.Groups[2].Value,
            FunctionalTag = m.Groups[3].Value,
            Lemma = m.Groups[4].Value,
            SecondaryTag = m.Groups[5].Value,
            MorphologicalTag = m.Groups[6].Value,
            Lexeme = m.Groups[7].Value
        };
        return true;
    }

    m = punctuationPattern.Match(line);
    if (m.Success)
    {
        element = new AdLeaf
        {
            Level = m.Groups[1].Length + 1,
            Lexeme = m.Groups[2].Value
        };
        return true;
    }

    if (safeParse)
    {
        element = null;
        return false;
    }

    // Knuppe: The most bizarre cases I found, were invalid data (like HTML, inside the sentences)
    // so I decided to implement the safeParse attribute, to ignore this junk...
    //
    // I think any program should adapt to an error in a file. otherwise the files will never
    // be fixed...

    // process the bizarre cases.
    // These markers are fixed tokens of the file format, so they are matched ordinally.
    if (line.Equals("_") ||
        line.StartsWith("<lixo", StringComparison.Ordinal) ||
        line.StartsWith("pause", StringComparison.Ordinal))
    {
        // Known junk lines are dropped silently.
        element = null;
        return false;
    }

    if (line.StartsWith("=", StringComparison.Ordinal))
    {
        m = bizarreLeafPattern.Match(line);
        if (m.Success)
        {
            var leaf = new AdLeaf
            {
                Level = m.Groups[1].Length + 1,
                SyntacticTag = m.Groups[2].Value,
                Lemma = m.Groups[3].Value,
                MorphologicalTag = m.Groups[4].Value,
                Lexeme = m.Groups[5].Value
            };

            // Lemmas longer than two characters lose their first character.
            if (!string.IsNullOrEmpty(leaf.Lemma) && leaf.Lemma.Length > 2)
            {
                leaf.Lemma = leaf.Lemma.Substring(1);
            }

            element = leaf;
            return true;
        }

        // Last resort: treat what follows the run of '=' (skipping one more character) as a lexeme.
        // NOTE(review): Regex.IsMatch is unanchored, so the pattern may match anywhere in the
        // remainder of the line - confirm this is intended.
        var level = line.LastIndexOf("=", StringComparison.Ordinal) + 1;
        if (level > 0 &&
            level < line.Length - 2 &&
            Regex.IsMatch(line.Substring(level + 1), "\\w.*?[\\.<>].*"))
        {
            element = new AdLeaf
            {
                Level = level + 1,
                Lexeme = line.Substring(level + 1)
            };
            return true;
        }
    }

    if (monitor != null)
    {
        monitor.OnWarning("Couldn't parse leaf: " + line);
    }

    element = null;
    return false;
}
/// <summary>
/// Converts the specified string representation of a tree element to its <see cref="AdTreeElement"/>
/// equivalent and returns a value that indicates whether the conversion succeeded.
/// </summary>
/// <param name="element">
/// When this method returns, contains the <see cref="AdTreeElement"/> value equivalent to the element
/// contained in <paramref name="line"/>, if the conversion succeeded, or <c>null</c> if the conversion
/// failed. The conversion fails if the <paramref name="line"/> parameter is null, is an empty string (""),
/// or does not contain a valid string representation of a AdElement. This parameter is passed
/// uninitialized.
/// </param>
/// <param name="line">The string representation of the element.</param>
/// <param name="safeParse">
/// if set to <c>true</c>, a line that matches none of the node, leaf or punctuation patterns is
/// rejected immediately, without the fallback handling for malformed lines and without a warning.
/// </param>
/// <param name="monitor">The evaluation monitor. This value can be a <c>null</c> value.</param>
/// <returns><c>true</c> if the <paramref name="line"/> parameter was converted successfully; otherwise, <c>false</c>.</returns>
/// <remarks>
/// NOTE(review): a method with this exact name and signature also appears immediately before this
/// one in the file; if both are members of the same type only one copy can be kept - confirm.
/// </remarks>
private static bool TryParseElement(out AdTreeElement element, string line, bool safeParse, Monitor monitor)
{
    // A null line is documented as a failed conversion; without this guard the first
    // pattern match would raise instead of returning false.
    if (line == null)
    {
        element = null;
        return false;
    }

    var m = nodePattern.Match(line);
    if (m.Success)
    {
        element = new AdNode
        {
            Level = m.Groups[1].Length + 1,
            SyntacticTag = m.Groups[2].Value
        };
        return true;
    }

    m = leafPattern.Match(line);
    if (m.Success)
    {
        element = new AdLeaf
        {
            Level = m.Groups[1].Length + 1,
            SyntacticTag = m.Groups[2].Value,
            FunctionalTag = m.Groups[3].Value,
            Lemma = m.Groups[4].Value,
            SecondaryTag = m.Groups[5].Value,
            MorphologicalTag = m.Groups[6].Value,
            Lexeme = m.Groups[7].Value
        };
        return true;
    }

    m = punctuationPattern.Match(line);
    if (m.Success)
    {
        element = new AdLeaf
        {
            Level = m.Groups[1].Length + 1,
            Lexeme = m.Groups[2].Value
        };
        return true;
    }

    if (safeParse)
    {
        element = null;
        return false;
    }

    // Knuppe: The most bizarre cases I found, were invalid data (like HTML, inside the sentences)
    // so I decided to implement the safeParse attribute, to ignore this junk...
    //
    // I think any program should adapt to an error in a file. otherwise the files will never
    // be fixed...

    // process the bizarre cases.
    // The markers below are fixed format tokens, hence the ordinal comparisons.
    if (line.Equals("_") ||
        line.StartsWith("<lixo", StringComparison.Ordinal) ||
        line.StartsWith("pause", StringComparison.Ordinal))
    {
        // Known junk lines are dropped without a warning.
        element = null;
        return false;
    }

    if (line.StartsWith("=", StringComparison.Ordinal))
    {
        m = bizarreLeafPattern.Match(line);
        if (m.Success)
        {
            var leaf = new AdLeaf
            {
                Level = m.Groups[1].Length + 1,
                SyntacticTag = m.Groups[2].Value,
                Lemma = m.Groups[3].Value,
                MorphologicalTag = m.Groups[4].Value,
                Lexeme = m.Groups[5].Value
            };

            // Lemmas longer than two characters lose their first character.
            if (!string.IsNullOrEmpty(leaf.Lemma) && leaf.Lemma.Length > 2)
            {
                leaf.Lemma = leaf.Lemma.Substring(1);
            }

            element = leaf;
            return true;
        }

        // Last resort: take what follows the run of '=' (skipping one more character) as a lexeme.
        // NOTE(review): Regex.IsMatch is unanchored, so the pattern may match anywhere in the
        // remainder of the line - confirm this is intended.
        var level = line.LastIndexOf("=", StringComparison.Ordinal) + 1;
        if (level > 0 &&
            level < line.Length - 2 &&
            Regex.IsMatch(line.Substring(level + 1), "\\w.*?[\\.<>].*"))
        {
            element = new AdLeaf
            {
                Level = level + 1,
                Lexeme = line.Substring(level + 1)
            };
            return true;
        }
    }

    if (monitor != null)
    {
        monitor.OnWarning("Couldn't parse leaf: " + line);
    }

    element = null;
    return false;
}