/// <summary>
/// Appends the tokens of one leaf of the Arvores Deitadas (AD) format to the sentence,
/// joining it with a pending left contraction part when there is one.
/// </summary>
/// <param name="leaf">The leaf to be processed. A <c>null</c> leaf is ignored.</param>
/// <param name="sentence">The sentence tokens collected so far; new tokens are appended to it.</param>
private void ProcessLeaf(AdLeaf leaf, List<string> sentence)
{
    if (leaf == null)
    {
        return;
    }

    if (leftContractionPart != null)
    {
        // A previous leaf left the first half of a contraction pending: try to join it with this leaf.
        var pendingLeft = leftContractionPart;
        var merged = PortugueseContractionUtility.ToContraction(pendingLeft, leaf.Lexeme);

        if (merged != null)
        {
            // The contraction replaces both halves, so this leaf contributes nothing else.
            sentence.AddRange(merged.RegExSplit(Expressions.Expression.Space));
            leftContractionPart = null;
            return;
        }

        // No contraction is known for this pair: keep the left half as its own token and
        // let the current leaf be handled as a regular leaf below.
        sentence.Add(pendingLeft);
        if (monitor != null)
        {
            monitor.OnWarning(string.Format("Missing contraction for: {0} - {1}", pendingLeft, leaf.Lexeme));
        }

        leftContractionPart = null;
    }

    var secondaryTag = leaf.SecondaryTag;
    if (secondaryTag != null && secondaryTag.Contains("<sam->"))
    {
        // This leaf opens a contraction: emit every underline-separated piece except the
        // last one, which is kept pending until the next leaf arrives.
        var pieces = leaf.Lexeme.RegExSplit(Expressions.Expression.Underline);
        var lastIndex = pieces.Length - 1;

        if (lastIndex > 0)
        {
            sentence.AddRange(pieces.SubArray(0, lastIndex));
        }

        leftContractionPart = pieces[lastIndex];
        return;
    }

    sentence.AddRange(ProcessLexeme(leaf.Lexeme));
}
/// <summary>
/// Appends one AD leaf to the parallel sentence / tag / chunk-target lists.
/// </summary>
/// <param name="leaf">The leaf to append.</param>
/// <param name="isIntermediate">
/// if set to <c>true</c> the leaf continues a chunk (<c>I-</c> prefix); otherwise it starts one (<c>B-</c> prefix).
/// </param>
/// <param name="phraseTag">
/// The phrase tag of the enclosing chunk. When it equals <c>Other</c> and the leaf has a functional tag,
/// the phrase tag is derived from that functional tag instead.
/// </param>
/// <param name="sentence">Receives the leaf lexeme.</param>
/// <param name="tags">
/// Receives the leaf lexeme when the leaf has no secondary tag; otherwise the leaf functional tag.
/// </param>
/// <param name="target">Receives the chunk tag computed for the leaf.</param>
protected void ProcessLeaf(
    AdLeaf leaf,
    bool isIntermediate,
    string phraseTag,
    List<string> sentence,
    List<string> tags,
    List<string> target)
{
    if (leaf.FunctionalTag != null && phraseTag.Equals(Other))
    {
        phraseTag = GetPhraseTagFromPosTag(leaf.FunctionalTag);
    }

    // "Other" is emitted as-is; every real phrase tag gets a begin/inside prefix.
    var chunkTag = phraseTag.Equals(Other)
        ? phraseTag
        : (isIntermediate ? "I-" : "B-") + phraseTag;

    sentence.Add(leaf.Lexeme);

    if (leaf.SecondaryTag == null)
    {
        tags.Add(leaf.Lexeme);
    }
    else
    {
        tags.Add(leaf.FunctionalTag);
    }

    target.Add(chunkTag);
}
/// <summary>
/// Appends the tokens of one leaf of the Arvores Deitadas (AD) format to the sentence and
/// records the named-entity span announced by the leaf secondary tag, if any.
/// </summary>
/// <param name="leaf">The leaf to be processed. A <c>null</c> leaf is ignored.</param>
/// <param name="sentence">The sentence tokens collected so far; new tokens are appended to it.</param>
/// <param name="names">The name spans collected so far; a span may be appended or the last one extended.</param>
private void ProcessLeaf(AdLeaf leaf, List<string> sentence, List<Span> names)
{
    if (leaf == null)
    {
        return;
    }

    var consumedByContraction = false;

    if (leftContractionPart != null)
    {
        // A previous leaf left the first half of a contraction pending: try to join it with this leaf.
        var merged = PortugueseContractionUtility.ToContraction(leftContractionPart, leaf.Lexeme);

        if (merged == null)
        {
            // No contraction is known for this pair: keep the left half as its own token;
            // the current leaf is still processed below.
            sentence.Add(leftContractionPart);
        }
        else
        {
            sentence.AddRange(merged.RegExSplit(Expressions.Expression.Space));
            consumedByContraction = true;
        }

        leftContractionPart = null;
    }

    var secondaryTag = leaf.SecondaryTag;
    string entityType = null;
    var entityStart = -1;
    var continuesLastEntity = false; // set when the leaf carries a <NER2> tag

    if (secondaryTag != null)
    {
        if (!consumedByContraction && secondaryTag.Contains("<sam->"))
        {
            // This leaf opens a contraction: emit every underline-separated piece except the
            // last one, which is kept pending until the next leaf arrives.
            var pieces = leaf.Lexeme.RegExSplit(Expressions.Expression.Underline);
            var lastPiece = pieces.Length - 1;

            if (lastPiece > 0)
            {
                sentence.AddRange(pieces.SubArray(0, lastPiece));
            }

            leftContractionPart = pieces[lastPiece];
            return;
        }

        if (secondaryTag.Contains("<NER2>"))
        {
            // This leaf can be part of the last name.
            continuesLastEntity = true;
        }

        entityType = GetNER(secondaryTag);
    }

    if (entityType != null)
    {
        entityStart = sentence.Count;
    }

    if (!consumedByContraction)
    {
        sentence.AddRange(ProcessLexeme(leaf.Lexeme));
    }

    if (entityType != null)
    {
        names.Add(new Span(entityStart, sentence.Count, entityType));
    }

    if (continuesLastEntity && names.Count > 0)
    {
        // A <NER2> leaf may continue the previous entity. The last span is widened only when
        // it ends right before the newest token; otherwise it is assumed to be a different
        // entity and the leaf is silently left out of any span.
        var lastIndex = names.Count - 1;
        var previous = names[lastIndex];

        if (previous.End == sentence.Count - 1)
        {
            names[lastIndex] = new Span(previous.Start, sentence.Count, previous.Type);
        }
    }
}
/// <summary>
/// Converts the specified string representation of a tree element to its <see cref="AdTreeElement"/>
/// equivalent and returns a value that indicates whether the conversion succeeded.
/// </summary>
/// <param name="element">
/// When this method returns, contains the <see cref="AdTreeElement"/> value equivalent to the element
/// contained in <paramref name="line"/>, if the conversion succeeded, or <c>null</c> if the conversion
/// failed. The conversion fails if the <paramref name="line"/> parameter is null or does not contain a
/// valid string representation of an AD element. This parameter is passed uninitialized.
/// </param>
/// <param name="line">The string representation of the element.</param>
/// <param name="safeParse">
/// if set to <c>true</c>, a line that matches none of the regular node, leaf or punctuation patterns is
/// rejected immediately: the recovery heuristics are skipped and no warning is reported.
/// </param>
/// <param name="monitor">
/// The evaluation monitor that receives a warning when the line cannot be parsed. Can be <c>null</c>.
/// </param>
/// <returns>
/// <c>true</c> if the <paramref name="line"/> parameter was converted successfully; otherwise, <c>false</c>.
/// </returns>
private static bool TryParseElement(out AdTreeElement element, string line, bool safeParse, Monitor monitor)
{
    element = null;

    // The documented contract is "null fails the conversion"; without this guard the first
    // Regex.Match call would throw an ArgumentNullException instead.
    if (line == null)
    {
        return false;
    }

    var m = nodePattern.Match(line);
    if (m.Success)
    {
        element = new AdNode
        {
            Level = m.Groups[1].Length + 1,
            SyntacticTag = m.Groups[2].Value
        };
        return true;
    }

    m = leafPattern.Match(line);
    if (m.Success)
    {
        element = new AdLeaf
        {
            Level = m.Groups[1].Length + 1,
            SyntacticTag = m.Groups[2].Value,
            FunctionalTag = m.Groups[3].Value,
            Lemma = m.Groups[4].Value,
            SecondaryTag = m.Groups[5].Value,
            MorphologicalTag = m.Groups[6].Value,
            Lexeme = m.Groups[7].Value
        };
        return true;
    }

    m = punctuationPattern.Match(line);
    if (m.Success)
    {
        element = new AdLeaf
        {
            Level = m.Groups[1].Length + 1,
            Lexeme = m.Groups[2].Value
        };
        return true;
    }

    if (safeParse)
    {
        return false;
    }

    // Knuppe: The most bizarre cases I found were invalid data (like HTML inside the sentences),
    // so I decided to implement the safeParse attribute to ignore this junk.
    //
    // I think any program should adapt to an error in a file, otherwise the files will never
    // be fixed.

    // Process the bizarre cases. The markers below are fixed format tokens, so they are
    // compared ordinally rather than with the current culture.
    if (line.Equals("_") ||
        line.StartsWith("<lixo", StringComparison.Ordinal) ||
        line.StartsWith("pause", StringComparison.Ordinal))
    {
        // Known junk lines are dropped without a warning.
        return false;
    }

    if (line.StartsWith("=", StringComparison.Ordinal))
    {
        m = bizarreLeafPattern.Match(line);
        if (m.Success)
        {
            var leaf = new AdLeaf
            {
                Level = m.Groups[1].Length + 1,
                SyntacticTag = m.Groups[2].Value,
                Lemma = m.Groups[3].Value,
                MorphologicalTag = m.Groups[4].Value,
                Lexeme = m.Groups[5].Value
            };

            // Drops the first character of the captured lemma when it is longer than two characters.
            // NOTE(review): presumably the pattern captures a leading delimiter - confirm against bizarreLeafPattern.
            if (!string.IsNullOrEmpty(leaf.Lemma) && leaf.Lemma.Length > 2)
            {
                leaf.Lemma = leaf.Lemma.Substring(1);
            }

            element = leaf;
            return true;
        }

        // Last resort: treat everything after the last '=' marker as the lexeme, as long as
        // the remainder looks like a word followed by '.', '<' or '>'.
        var level = line.LastIndexOf('=') + 1;
        if (level > 0 &&
            level < line.Length - 2 &&
            Regex.IsMatch(line.Substring(level + 1), "\\w.*?[\\.<>].*"))
        {
            element = new AdLeaf
            {
                Level = level + 1,
                Lexeme = line.Substring(level + 1)
            };
            return true;
        }
    }

    if (monitor != null)
    {
        monitor.OnWarning("Couldn't parse leaf: " + line);
    }

    return false;
}
/// <summary>
/// Appends one AD leaf to the four parallel output lists, expanding multi-word lexemes
/// (joined by underlines) and hyphenated lexemes into several tokens when applicable.
/// </summary>
/// <param name="leaf">The leaf to be processed. A <c>null</c> leaf is ignored.</param>
/// <param name="sentence">Receives the token(s) produced for the leaf.</param>
/// <param name="tags">Receives one tag per token added to <paramref name="sentence"/>.</param>
/// <param name="con">
/// Receives one contraction marker per token: <c>"B"</c> for a <c>&lt;sam-&gt;</c> leaf,
/// <c>"E"</c> for a <c>&lt;-sam&gt;</c> leaf, otherwise <c>null</c>.
/// </param>
/// <param name="prop">
/// Receives one value per token: <c>"P"</c> for an unexpanded multi-word lexeme tagged <c>prop</c>,
/// otherwise <c>null</c>.
/// </param>
private void ProcessLeaf(AdLeaf leaf, List<string> sentence, List<string> tags, List<string> con, List<string> prop)
{
    if (leaf == null)
    {
        return;
    }

    var lexeme = leaf.Lexeme;

    // This changes half of the quotation marks: angle quotes become a plain double quote
    // whenever the call counter is even.
    if (("«" == lexeme || "»" == lexeme) && callsCount % 2 == 0)
    {
        lexeme = "\"";
    }

    string contraction = null;
    if (leaf.SecondaryTag != null)
    {
        if (leaf.SecondaryTag.Contains("<sam->"))
        {
            contraction = "B";
        }
        else if (leaf.SecondaryTag.Contains("<-sam>"))
        {
            contraction = "E";
        }
    }

    // The tag falls back to the lexeme itself when the leaf has no functional tag.
    var tag = leaf.FunctionalTag ?? lexeme;

    if (includeFeatures && !string.IsNullOrEmpty(leaf.MorphologicalTag))
    {
        tag += " " + leaf.MorphologicalTag;
    }

    // Whitespace inside the tag is replaced by '=' so the tag stays a single token.
    tag = tag.RegExReplace(Expressions.Expression.Space, "=") ?? lexeme;

    if (expandMe && lexeme.Contains("_"))
    {
        var tokenizer = new StringTokenizer(lexeme, "_");
        var tokenCount = tokenizer.CountTokens;

        if (tag == "prop")
        {
            // Proper names are kept as a single token and flagged in the prop list.
            sentence.Add(lexeme);
            tags.Add(tag);
            con.Add(null);
            prop.Add("P");
            return;
        }

        if (tokenCount <= 0)
        {
            AppendToken(lexeme, tag, contraction, sentence, tags, prop, con);
            return;
        }

        var pieces = new List<string>(tokenCount);
        var pieceTags = new List<string>(tokenCount);
        var prefix = "B-";

        // The first piece opens the tag (B-), every following piece continues it (I-).
        do
        {
            pieces.Add(tokenizer.NextToken);
            pieceTags.Add(prefix + tag);
            prefix = "I-";
        }
        while (tokenizer.HasMoreTokens);

        // Only the last piece carries the contraction marker (which may itself be null);
        // all the preceding pieces get null.
        con.AddRange(new string[pieces.Count - 1]);
        con.Add(contraction);

        sentence.AddRange(pieces);
        tags.AddRange(pieceTags);
        prop.AddRange(new string[pieces.Count]);
        return;
    }

    if (lexeme.Contains(hyphen) && lexeme.Length > 1)
    {
        var match = hyphenRegex.Match(lexeme);
        if (!match.Success)
        {
            AppendToken(lexeme, tag, contraction, sentence, tags, prop, con);
            return;
        }

        string head = null;
        string second = null;
        string tail = null;

        // Exactly one of the three alternatives of the hyphen pattern is expected to match.
        if (match.Groups[1].Success)
        {
            head = match.Groups[2].Value;
        }
        else if (match.Groups[3].Success)
        {
            second = match.Groups[4].Value;
            tail = match.Groups[5].Value;
        }
        else if (match.Groups[6].Success)
        {
            head = match.Groups[7].Value;
            second = match.Groups[8].Value;
            tail = match.Groups[9].Value;
        }
        else
        {
            throw new InvalidFormatException("Wrong hyphen pattern.");
        }

        if (!string.IsNullOrEmpty(head))
        {
            AppendToken(head, tag, contraction, sentence, tags, prop, con);
        }

        // The hyphen is emitted as its own token, tagged with the hyphen itself.
        if (!string.IsNullOrEmpty(hyphen))
        {
            AppendToken(hyphen, hyphen, contraction, sentence, tags, prop, con);
        }

        if (!string.IsNullOrEmpty(second))
        {
            AppendToken(second, tag, contraction, sentence, tags, prop, con);
        }

        if (!string.IsNullOrEmpty(tail))
        {
            AppendToken(tail, tag, contraction, sentence, tags, prop, con);
        }

        return;
    }

    // Plain single-token leaf: this is the only path where AddGender is applied to the tag.
    AppendToken(lexeme, AddGender(tag, leaf.MorphologicalTag), contraction, sentence, tags, prop, con);
}

/// <summary>
/// Appends a single token to the four parallel lists, with a <c>null</c> prop value.
/// </summary>
private static void AppendToken(
    string token,
    string tokenTag,
    string contraction,
    List<string> sentence,
    List<string> tags,
    List<string> prop,
    List<string> con)
{
    sentence.Add(token);
    tags.Add(tokenTag);
    prop.Add(null);
    con.Add(contraction);
}
/// <summary>
/// Appends one AD leaf to the sentence, joining the two halves of a contraction
/// (<c>&lt;sam-&gt;</c> followed by <c>&lt;-sam&gt;</c>) into a single token and recording
/// a <c>"default"</c> span over every contraction token.
/// </summary>
/// <param name="leaf">The leaf to be processed. A <c>null</c> leaf is ignored.</param>
/// <param name="sentence">The sentence tokens collected so far; new tokens are appended to it.</param>
/// <param name="names">The contraction spans collected so far; new spans are appended to it.</param>
private void ProcessLeaf(AdLeaf leaf, List<string> sentence, List<Span> names)
{
    if (leaf == null)
    {
        return;
    }

    if (leftContractionPart == null)
    {
        var leafTag = leaf.SecondaryTag;
        if (leafTag != null && leafTag.Contains("<sam->"))
        {
            // This leaf opens a contraction: emit every underline-separated piece except the
            // last one, which is kept pending until the next leaf arrives.
            var lexemes = leaf.Lexeme.RegExSplit(Expressions.Expression.Underline);

            for (var i = 0; i < lexemes.Length - 1; i++)
            {
                sentence.Add(lexemes[i]);

                // A piece that can itself be expanded is a contraction, so it gets a span too.
                if (PortugueseContractionUtility.Expand(lexemes[i]) != null)
                {
                    names.Add(new Span(sentence.Count - 1, sentence.Count, "default"));
                }
            }

            leftContractionPart = lexemes[lexemes.Length - 1];
            return;
        }

        sentence.Add(leaf.Lexeme);
        return;
    }

    // A left half is pending: this leaf is expected to be its right half.
    var tag = leaf.SecondaryTag;

    if (tag != null && tag.Contains("<-sam>"))
    {
        var parts = leaf.Lexeme.RegExSplit(Expressions.Expression.Underline);
        var hasParts = parts != null && parts.Length > 0;

        // Only the first piece is joined with the pending left half; without a usable
        // split result the whole lexeme is used.
        var right = hasParts ? parts[0] : leaf.Lexeme;

        var c = PortugueseContractionUtility.ToContraction(leftContractionPart, right);
        if (c != null)
        {
            sentence.Add(c);
            names.Add(new Span(sentence.Count - 1, sentence.Count, "default"));
        }
        else
        {
            // Unknown contraction: report it and keep both halves as separate tokens so that
            // no text is lost from the sentence.
            if (monitor != null)
            {
                monitor.OnError("ContractionNameSample: Missing " + leftContractionPart + " + " + right);
            }

            sentence.Add(leftContractionPart);
            sentence.Add(right);
        }

        if (hasParts)
        {
            for (var i = 1; i < parts.Length; i++)
            {
                sentence.Add(parts[i]);
            }
        }
    }
    else
    {
        // NOTE(review): in this case neither the pending left part nor the current lexeme is
        // added to the sentence - confirm that dropping them is intended.
        if (monitor != null)
        {
            monitor.OnError("ContractionNameSample: No match " + leftContractionPart + " + " + leaf.Lexeme);
        }
    }

    leftContractionPart = null;
}
/// <summary>
/// Converts the specified string representation of a tree element to its <see cref="AdTreeElement"/>
/// equivalent and returns a value that indicates whether the conversion succeeded.
/// </summary>
/// <param name="element">
/// When this method returns, contains the <see cref="AdTreeElement"/> value equivalent to the element
/// contained in <paramref name="line"/>, if the conversion succeeded, or <c>null</c> if the conversion
/// failed. The conversion fails if the <paramref name="line"/> parameter is null or does not contain a
/// valid string representation of an AD element. This parameter is passed uninitialized.
/// </param>
/// <param name="line">The string representation of the element.</param>
/// <param name="safeParse">
/// if set to <c>true</c>, a line that matches none of the regular node, leaf or punctuation patterns is
/// rejected immediately: the recovery heuristics are skipped and no warning is reported.
/// </param>
/// <param name="monitor">
/// The evaluation monitor that receives a warning when the line cannot be parsed. Can be <c>null</c>.
/// </param>
/// <returns>
/// <c>true</c> if the <paramref name="line"/> parameter was converted successfully; otherwise, <c>false</c>.
/// </returns>
private static bool TryParseElement(out AdTreeElement element, string line, bool safeParse, Monitor monitor)
{
    element = null;

    // The documented contract is "null fails the conversion"; without this guard the first
    // Regex.Match call would throw an ArgumentNullException instead.
    if (line == null)
    {
        return false;
    }

    var m = nodePattern.Match(line);
    if (m.Success)
    {
        element = new AdNode
        {
            Level = m.Groups[1].Length + 1,
            SyntacticTag = m.Groups[2].Value
        };
        return true;
    }

    m = leafPattern.Match(line);
    if (m.Success)
    {
        element = new AdLeaf
        {
            Level = m.Groups[1].Length + 1,
            SyntacticTag = m.Groups[2].Value,
            FunctionalTag = m.Groups[3].Value,
            Lemma = m.Groups[4].Value,
            SecondaryTag = m.Groups[5].Value,
            MorphologicalTag = m.Groups[6].Value,
            Lexeme = m.Groups[7].Value
        };
        return true;
    }

    m = punctuationPattern.Match(line);
    if (m.Success)
    {
        element = new AdLeaf
        {
            Level = m.Groups[1].Length + 1,
            Lexeme = m.Groups[2].Value
        };
        return true;
    }

    if (safeParse)
    {
        return false;
    }

    // Knuppe: The most bizarre cases I found were invalid data (like HTML inside the sentences),
    // so I decided to implement the safeParse attribute to ignore this junk.
    //
    // I think any program should adapt to an error in a file, otherwise the files will never
    // be fixed.

    // Process the bizarre cases. The markers below are fixed format tokens, so they are
    // compared ordinally rather than with the current culture.
    if (line.Equals("_") ||
        line.StartsWith("<lixo", StringComparison.Ordinal) ||
        line.StartsWith("pause", StringComparison.Ordinal))
    {
        // Known junk lines are dropped without a warning.
        return false;
    }

    if (line.StartsWith("=", StringComparison.Ordinal))
    {
        m = bizarreLeafPattern.Match(line);
        if (m.Success)
        {
            var leaf = new AdLeaf
            {
                Level = m.Groups[1].Length + 1,
                SyntacticTag = m.Groups[2].Value,
                Lemma = m.Groups[3].Value,
                MorphologicalTag = m.Groups[4].Value,
                Lexeme = m.Groups[5].Value
            };

            // Drops the first character of the captured lemma when it is longer than two characters.
            // NOTE(review): presumably the pattern captures a leading delimiter - confirm against bizarreLeafPattern.
            if (!string.IsNullOrEmpty(leaf.Lemma) && leaf.Lemma.Length > 2)
            {
                leaf.Lemma = leaf.Lemma.Substring(1);
            }

            element = leaf;
            return true;
        }

        // Last resort: treat everything after the last '=' marker as the lexeme, as long as
        // the remainder looks like a word followed by '.', '<' or '>'.
        var level = line.LastIndexOf('=') + 1;
        if (level > 0 &&
            level < line.Length - 2 &&
            Regex.IsMatch(line.Substring(level + 1), "\\w.*?[\\.<>].*"))
        {
            element = new AdLeaf
            {
                Level = level + 1,
                Lexeme = line.Substring(level + 1)
            };
            return true;
        }
    }

    if (monitor != null)
    {
        monitor.OnWarning("Couldn't parse leaf: " + line);
    }

    return false;
}
/// <summary>
/// Gets the chunk tag for the given leaf.
/// </summary>
/// <param name="leaf">The leaf whose secondary tag is inspected.</param>
/// <returns>
/// <c>"VP"</c> when the leaf secondary tag is exactly <c>"P"</c>; otherwise <c>null</c>.
/// </returns>
protected virtual string GetChunkTag(AdLeaf leaf)
{
    if (leaf.SecondaryTag == "P")
    {
        return "VP";
    }

    return null;
}