/// <summary>
/// Reads the next <see cref="T:SentenceSample"/> from the underlying sentence stream.
/// One call keeps consuming sentences for as long as either the paragraph flag or the
/// text flag signals a continuation, and joins the accepted sentences into one document.
/// </summary>
/// <remarks>
/// A sentence is accepted when it is not a title (or titles are included) and
/// <c>HasPunctuation</c> approves its text. Accepted sentences are separated by a single
/// space and each one is recorded as a span over the document. The document may be empty
/// when every sentence of the group was rejected.
/// </remarks>
/// <returns>
/// The next sample, or <c>null</c> when the underlying stream has no more sentences.
/// </returns>
public SentenceSample Read() {
    // A sentence can be left over from the previous call; only fetch when there is none.
    if (sentence == null) {
        sentence = adSentenceStream.Read();
        UpdateMeta();
    }

    if (sentence == null) {
        return null;
    }

    var buffer = new StringBuilder();
    var spans = new List<Span>();

    do {
        var accepted = (!isTitle || isIncludeTitles) && HasPunctuation(sentence.Text);
        if (accepted) {
            var begin = buffer.Length;
            buffer.Append(sentence.Text);
            spans.Add(new Span(begin, buffer.Length));
            buffer.Append(' ');
        }

        // Look ahead; presumably UpdateMeta refreshes the flags tested below (defined outside this block).
        sentence = adSentenceStream.Read();
        UpdateMeta();
    } while (isSamePara || isSameText);

    // Drop the separator that follows the last accepted sentence.
    if (buffer.Length > 0) {
        buffer.Length--;
    }

    return new SentenceSample(buffer.ToString(), spans.ToArray());
}
/// <summary>
/// Resolves the numeric text identifier of a sentence from its metadata.
/// </summary>
/// <remarks>
/// The corpus type is detected once, from the first non-empty metadata seen: a <c>LIT</c>
/// prefix selects Lit, a <c>CIE</c> prefix selects Cie and anything else is treated as Ama.
/// For Ama the identifier is the integer captured by the first group of the Ama pattern.
/// For Cie and Lit the first group is a textual key, and a running counter is advanced
/// every time that key differs from the previous one.
/// </remarks>
/// <param name="sentence">The sentence whose metadata is inspected.</param>
/// <returns>
/// The text identifier, or <c>0</c> when the corpus type is not Ama, Cie or Lit
/// (for example when it is still undetermined because no metadata has been seen).
/// </returns>
/// <exception cref="InvalidFormatException">
/// The metadata does not match the pattern of the detected corpus type.
/// </exception>
private int GetTextId(AdSentence sentence) {
    if (corpusType == Type.None && !string.IsNullOrEmpty(sentence.Metadata)) {
        // The prefixes are fixed corpus markers, so compare them ordinally.
        if (sentence.Metadata.StartsWith("LIT", System.StringComparison.Ordinal)) {
            corpusType = Type.Lit;
        } else if (sentence.Metadata.StartsWith("CIE", System.StringComparison.Ordinal)) {
            corpusType = Type.Cie;
        } else {
            corpusType = Type.Ama;
        }
    }

    Match match;
    switch (corpusType) {
        case Type.Ama:
            match = AmaMetaRegex.Match(sentence.Metadata);
            if (match.Success) {
                return int.Parse(match.Groups[1].Value, System.Globalization.CultureInfo.InvariantCulture);
            }
            break;
        case Type.Cie:
            match = CieMetaRegex.Match(sentence.Metadata);
            if (match.Success) {
                return NextSequentialTextId(match.Groups[1].Value);
            }
            break;
        case Type.Lit:
            // Lit follows the same scheme as Cie: the counter only moves when the textual key changes.
            match = LitMetaRegex.Match(sentence.Metadata);
            if (match.Success) {
                return NextSequentialTextId(match.Groups[1].Value);
            }
            break;
        default:
            return 0;
    }

    // Reached only when a known corpus type failed to match its pattern.
    throw new InvalidFormatException("Invalid metadata: " + sentence.Metadata);
}

/// <summary>
/// Returns the running text identifier for a textual key, advancing the counter
/// and remembering the key when it differs from the one seen last.
/// </summary>
/// <param name="text">The textual key extracted from the metadata.</param>
/// <returns>The identifier assigned to <paramref name="text"/>.</returns>
private int NextSequentialTextId(string text) {
    if (!text.Equals(textMeta2)) {
        textIdMeta2++;
        textMeta2 = text;
    }

    return textIdMeta2;
}
/// <summary>
/// Converts the string representation of a single sentence block to its
/// <see cref="AdSentence" /> equivalent. A return value indicates whether the
/// conversion succeeded or failed.
/// </summary>
/// <remarks>
/// This method does not let parse errors escape: any exception raised while reading the
/// block is wrapped in an <see cref="System.IO.InvalidDataException"/>, handed to
/// <paramref name="monitor"/> (when one is supplied) and reported as a <c>false</c> result.
/// When a line equal to <c>&amp;&amp;</c> is found before the <c>SOURCE</c> line, the tree is
/// parsed but the text and metadata of the resulting sentence are left <c>null</c>.
/// </remarks>
/// <param name="sentence">When this method returns, the parsed sentence if the conversion succeeded; otherwise <c>null</c>.</param>
/// <param name="sentenceString">The sentence string.</param>
/// <param name="para">The paragraph number, appended to the metadata as <c> p=N</c>.</param>
/// <param name="isTitle">if set to <c>true</c>, <c> title</c> is appended to the metadata.</param>
/// <param name="isBox">if set to <c>true</c>, <c> box</c> is appended to the metadata.</param>
/// <param name="safeParse">if set to <c>true</c>, the whole sentence is rejected as soon as one element line fails to parse; otherwise such a line is skipped.</param>
/// <param name="monitor">The evaluation monitor that receives warnings and wrapped exceptions. This value can be a <c>null</c> value.</param>
/// <returns><c>true</c> if the <paramref name="sentenceString"/> parameter was converted successfully, <c>false</c> otherwise.</returns>
public static bool TryParse(
    out AdSentence sentence,
    string sentenceString,
    int para,
    bool isTitle,
    bool isBox,
    bool safeParse,
    Monitor monitor) {

    // Failure is the default outcome; only the last statement publishes a parsed sentence.
    sentence = null;

    string text = null;
    string meta = null;
    var sent = new AdSentence();

    try {
        using (var reader = new StringReader(sentenceString)) {
            // first line is <s ...>
            var line = reader.ReadLine();
            if (line == null) {
                return false;
            }

            // Advance to the SOURCE line; "&&" marks a different suggestion of parse
            // that carries no header of its own.
            var useSameTextAndMeta = false;
            while (!line.StartsWith("SOURCE", StringComparison.Ordinal)) {
                if (line.Equals("&&")) {
                    useSameTextAndMeta = true;
                    break;
                }

                line = reader.ReadLine();
                if (line == null) {
                    return false;
                }
            }

            if (!useSameTextAndMeta) {
                // Whatever follows the first 7 characters of the SOURCE line ends the metadata.
                // A shorter line makes Substring throw, which the catch below reports.
                var metaFromSource = line.Substring(7);

                line = reader.ReadLine();
                if (line == null) {
                    return false;
                }

                var start = line.IndexOf(' ');
                if (start <= 0) {
                    // rare case where there is no space between id and the sentence.
                    if (monitor != null) {
                        monitor.OnWarning("A sentence was skipped due a possible integrity loss.");
                    }

                    // The OpenNLP uses the previous meta, but it is better to just ignore the sentence
                    // since the previous meta is not related to the current one.
                    return false;
                }

                // Only clean up the text once the line is known to be usable.
                text = FixPunctuation(line.Substring(start + 1).Trim());

                meta = line.Substring(0, start) + " p=" + para;
                if (isTitle) {
                    meta += " title";
                }
                if (isBox) {
                    meta += " box";
                }
                meta += metaFromSource;
            }

            sent.Text = text;
            sent.Metadata = meta;

            // skip lines starting with ###
            line = reader.ReadLine();
            while (line != null && line.StartsWith("###", StringComparison.Ordinal)) {
                line = reader.ReadLine();
            }

            var nodeStack = new List<AdNode>();
            sent.Root = new AdNode { SyntacticTag = "ROOT", Level = 0 };
            nodeStack.Add(sent.Root);

            while (!string.IsNullOrEmpty(line) &&
                   !line.StartsWith("</s>", StringComparison.Ordinal) &&
                   !line.Equals("&&")) {

                AdTreeElement element;
                if (TryParseElement(out element, line, safeParse, monitor)) {
                    // The idea here is to keep a stack of nodes that are candidates for
                    // parenting the following elements (nodes and leafs).

                    // 1) When we get a new element, we check its level and remove from
                    //    the top of the stack the nodes that are brothers or nephews.
                    while (nodeStack.Count != 0 &&
                           element.Level > 0 &&
                           element.Level <= nodeStack[nodeStack.Count - 1].Level) {
                        nodeStack.RemoveAt(nodeStack.Count - 1); // pop
                    }

                    if (element.IsLeaf) {
                        if (element.Level == 0) {
                            // 2a) A level 0 leaf hangs directly from the root.
                            nodeStack[0].Elements.Add(element);
                        } else {
                            // 2b) There are parent candidates: take the nearest node on the
                            //     stack with a lower level, falling back to the root.
                            var parent = nodeStack[0];
                            for (var index = nodeStack.Count - 1; index >= 0; index--) {
                                if (nodeStack[index].Level < element.Level) {
                                    parent = nodeStack[index];
                                    break;
                                }
                            }
                            parent.AddElement(element);
                        }
                    } else {
                        // 3) Check if the element that is at the top of the stack is this
                        //    node parent, if yes add it as a son.
                        if (nodeStack.Count != 0 && nodeStack[nodeStack.Count - 1].Level < element.Level) {
                            nodeStack[nodeStack.Count - 1].AddElement(element);
                        } else {
                            // Caught below and reported through the monitor.
                            throw new InvalidOperationException("Should not happen!");
                        }
                        nodeStack.Add((AdNode)element);
                    }
                } else if (safeParse) {
                    // invalid element, so we skip this sentence...
                    return false;
                }

                line = reader.ReadLine();
            }
        }
    } catch (Exception ex) {
        if (monitor != null) {
            monitor.OnException(new InvalidDataException("Something went wrong during the AdSentence parse.", ex));
        }
        return false;
    }

    sentence = sent;
    return true;
}
/// <summary>
/// Converts the string representation of a single sentence block to its
/// <see cref="AdSentence" /> equivalent. A return value indicates whether the
/// conversion succeeded or failed.
/// </summary>
/// <remarks>
/// This method does not let parse errors escape: any exception raised while reading the
/// block is wrapped in an <see cref="System.IO.InvalidDataException"/>, handed to
/// <paramref name="monitor"/> (when one is supplied) and reported as a <c>false</c> result.
/// When a line equal to <c>&amp;&amp;</c> is found before the <c>SOURCE</c> line, the tree is
/// parsed but the text and metadata of the resulting sentence are left <c>null</c>.
/// </remarks>
/// <param name="sentence">When this method returns, the parsed sentence if the conversion succeeded; otherwise <c>null</c>.</param>
/// <param name="sentenceString">The sentence string.</param>
/// <param name="para">The paragraph number, appended to the metadata as <c> p=N</c>.</param>
/// <param name="isTitle">if set to <c>true</c>, <c> title</c> is appended to the metadata.</param>
/// <param name="isBox">if set to <c>true</c>, <c> box</c> is appended to the metadata.</param>
/// <param name="safeParse">if set to <c>true</c>, the whole sentence is rejected as soon as one element line fails to parse; otherwise such a line is skipped.</param>
/// <param name="monitor">The evaluation monitor that receives warnings and wrapped exceptions. This value can be a <c>null</c> value.</param>
/// <returns><c>true</c> if the <paramref name="sentenceString"/> parameter was converted successfully, <c>false</c> otherwise.</returns>
public static bool TryParse(
    out AdSentence sentence,
    string sentenceString,
    int para,
    bool isTitle,
    bool isBox,
    bool safeParse,
    Monitor monitor) {

    // Failure is the default outcome; only the last statement publishes a parsed sentence.
    sentence = null;

    string text = null;
    string meta = null;
    var sent = new AdSentence();

    try {
        using (var reader = new StringReader(sentenceString)) {
            // first line is <s ...>
            var line = reader.ReadLine();
            if (line == null) {
                return false;
            }

            // Advance to the SOURCE line; "&&" marks a different suggestion of parse
            // that carries no header of its own.
            var useSameTextAndMeta = false;
            while (!line.StartsWith("SOURCE", StringComparison.Ordinal)) {
                if (line.Equals("&&")) {
                    useSameTextAndMeta = true;
                    break;
                }

                line = reader.ReadLine();
                if (line == null) {
                    return false;
                }
            }

            if (!useSameTextAndMeta) {
                // Whatever follows the first 7 characters of the SOURCE line ends the metadata.
                // A shorter line makes Substring throw, which the catch below reports.
                var metaFromSource = line.Substring(7);

                line = reader.ReadLine();
                if (line == null) {
                    return false;
                }

                var start = line.IndexOf(' ');
                if (start <= 0) {
                    // rare case where there is no space between id and the sentence.
                    if (monitor != null) {
                        monitor.OnWarning("A sentence was skipped due a possible integrity loss.");
                    }

                    // The OpenNLP uses the previous meta, but it is better to just ignore the sentence
                    // since the previous meta is not related to the current one.
                    return false;
                }

                // Only clean up the text once the line is known to be usable.
                text = FixPunctuation(line.Substring(start + 1).Trim());

                meta = line.Substring(0, start) + " p=" + para;
                if (isTitle) {
                    meta += " title";
                }
                if (isBox) {
                    meta += " box";
                }
                meta += metaFromSource;
            }

            sent.Text = text;
            sent.Metadata = meta;

            // skip lines starting with ###
            line = reader.ReadLine();
            while (line != null && line.StartsWith("###", StringComparison.Ordinal)) {
                line = reader.ReadLine();
            }

            var nodeStack = new List<AdNode>();
            sent.Root = new AdNode { SyntacticTag = "ROOT", Level = 0 };
            nodeStack.Add(sent.Root);

            while (!string.IsNullOrEmpty(line) &&
                   !line.StartsWith("</s>", StringComparison.Ordinal) &&
                   !line.Equals("&&")) {

                AdTreeElement element;
                if (TryParseElement(out element, line, safeParse, monitor)) {
                    // The idea here is to keep a stack of nodes that are candidates for
                    // parenting the following elements (nodes and leafs).

                    // 1) When we get a new element, we check its level and remove from
                    //    the top of the stack the nodes that are brothers or nephews.
                    while (nodeStack.Count != 0 &&
                           element.Level > 0 &&
                           element.Level <= nodeStack[nodeStack.Count - 1].Level) {
                        nodeStack.RemoveAt(nodeStack.Count - 1); // pop
                    }

                    if (element.IsLeaf) {
                        if (element.Level == 0) {
                            // 2a) A level 0 leaf hangs directly from the root.
                            nodeStack[0].Elements.Add(element);
                        } else {
                            // 2b) There are parent candidates: take the nearest node on the
                            //     stack with a lower level, falling back to the root.
                            var parent = nodeStack[0];
                            for (var index = nodeStack.Count - 1; index >= 0; index--) {
                                if (nodeStack[index].Level < element.Level) {
                                    parent = nodeStack[index];
                                    break;
                                }
                            }
                            parent.AddElement(element);
                        }
                    } else {
                        // 3) Check if the element that is at the top of the stack is this
                        //    node parent, if yes add it as a son.
                        if (nodeStack.Count != 0 && nodeStack[nodeStack.Count - 1].Level < element.Level) {
                            nodeStack[nodeStack.Count - 1].AddElement(element);
                        } else {
                            // Caught below and reported through the monitor.
                            throw new InvalidOperationException("Should not happen!");
                        }
                        nodeStack.Add((AdNode) element);
                    }
                } else if (safeParse) {
                    // invalid element, so we skip this sentence...
                    return false;
                }

                line = reader.ReadLine();
            }
        }
    } catch (Exception ex) {
        if (monitor != null) {
            monitor.OnException(new InvalidDataException("Something went wrong during the AdSentence parse.", ex));
        }
        return false;
    }

    sentence = sent;
    return true;
}