/// <summary>
/// Processes one leaf of an Árvores Deitadas (AD) tree, appending its tokens to the sentence.
/// </summary>
/// <remarks>
/// A leaf tagged <c>&lt;sam-&gt;</c> does not emit its last underscore-separated lexeme;
/// that lexeme is stored in <c>leftContractionPart</c> and joined with the lexeme of
/// the next processed leaf.
/// </remarks>
/// <param name="leaf">The leaf to be processed. A <c>null</c> leaf is ignored.</param>
/// <param name="sentence">The sentence tokens collected so far; new tokens are appended.</param>
private void ProcessLeaf(AdLeaf leaf, List<string> sentence) {
    if (leaf == null) {
        return;
    }

    // True once this leaf's lexeme has been emitted as part of a contraction.
    var merged = false;

    if (leftContractionPart != null) {
        // A previous leaf left the first half of a contraction pending; try to join it with this leaf.
        var contraction = PortugueseContractionUtility.ToContraction(leftContractionPart, leaf.Lexeme);
        merged = contraction != null;

        if (merged) {
            sentence.AddRange(contraction.RegExSplit(Expressions.Expression.Space));
        } else {
            // No contraction is known for this pair: emit the pending part on its own and
            // let the current leaf go through the regular path below.
            sentence.Add(leftContractionPart);
            if (monitor != null) {
                monitor.OnWarning(string.Format("Missing contraction for: {0} - {1}", leftContractionPart, leaf.Lexeme));
            }
        }

        leftContractionPart = null;
    }

    var secondaryTag = leaf.SecondaryTag;
    var startsContraction = !merged && secondaryTag != null && secondaryTag.Contains("<sam->");

    if (startsContraction) {
        // Emit every lexeme but the last; the last one waits for the next leaf.
        var pieces = leaf.Lexeme.RegExSplit(Expressions.Expression.Underline);
        var lastPiece = pieces.Length - 1;
        if (lastPiece > 0) {
            sentence.AddRange(pieces.SubArray(0, lastPiece));
        }

        leftContractionPart = pieces[lastPiece];
        return;
    }

    if (!merged) {
        sentence.AddRange(ProcessLexeme(leaf.Lexeme));
    }
}
/// <summary>
/// Processes one leaf of an Árvores Deitadas (AD) tree, appending its tokens to the
/// sentence and recording any named-entity span the leaf carries.
/// </summary>
/// <remarks>
/// A leaf tagged <c>&lt;sam-&gt;</c> does not emit its last underscore-separated lexeme;
/// that lexeme is stored in <c>leftContractionPart</c> and joined with the lexeme of
/// the next processed leaf.
/// </remarks>
/// <param name="leaf">The leaf to be processed. A <c>null</c> leaf is ignored.</param>
/// <param name="sentence">The sentence tokens collected so far; new tokens are appended.</param>
/// <param name="names">The name spans collected so far; may be appended to or have its last span widened.</param>
private void ProcessLeaf(AdLeaf leaf, List<string> sentence, List<Span> names) {
    if (leaf == null) {
        return;
    }

    // True once this leaf's lexeme has been emitted as part of a contraction.
    var merged = false;

    if (leftContractionPart != null) {
        // A previous leaf left the first half of a contraction pending; try to join it with this leaf.
        var contraction = PortugueseContractionUtility.ToContraction(leftContractionPart, leaf.Lexeme);
        merged = contraction != null;

        if (merged) {
            sentence.AddRange(contraction.RegExSplit(Expressions.Expression.Space));
        } else {
            // No contraction is known for this pair: emit the pending part on its own and
            // let the current leaf go through the regular path below.
            sentence.Add(leftContractionPart);
        }

        leftContractionPart = null;
    }

    string entityType = null;
    var continuesEntity = false; // set when the leaf carries a <NER2> tag
    var secondaryTag = leaf.SecondaryTag;

    if (secondaryTag != null) {
        if (!merged && secondaryTag.Contains("<sam->")) {
            // Emit every lexeme but the last; the last one waits for the next leaf.
            var pieces = leaf.Lexeme.RegExSplit(Expressions.Expression.Underline);
            var lastPiece = pieces.Length - 1;
            if (lastPiece > 0) {
                sentence.AddRange(pieces.SubArray(0, lastPiece));
            }

            leftContractionPart = pieces[lastPiece];
            return;
        }

        // A <NER2> leaf can be part of the previous name.
        continuesEntity = secondaryTag.Contains("<NER2>");
        entityType = GetNER(secondaryTag);
    }

    // Token index where this leaf's tokens begin; only used when the leaf is an entity.
    var entityStart = sentence.Count;

    if (!merged) {
        sentence.AddRange(ProcessLexeme(leaf.Lexeme));
    }

    if (entityType != null) {
        names.Add(new Span(entityStart, sentence.Count, entityType));
    }

    if (continuesEntity && names.Count > 0) {
        // Widen the last span only when it ends right before the newest token.
        // Otherwise it is probably not the same entity, and the tag is ignored.
        var lastIndex = names.Count - 1;
        var previous = names[lastIndex];
        if (previous.End == sentence.Count - 1) {
            names[lastIndex] = new Span(previous.Start, sentence.Count, previous.Type);
        }
    }
}
/// <summary>
/// Processes one leaf of an AD tree for contraction samples: appends its tokens to the
/// sentence and adds a <c>"default"</c> span over every token that is a contraction.
/// </summary>
/// <remarks>
/// A leaf tagged <c>&lt;sam-&gt;</c> stores its last underscore-separated lexeme in
/// <c>leftContractionPart</c>. The next leaf is expected to carry <c>&lt;-sam&gt;</c>,
/// and its first lexeme is joined with the stored part.
/// </remarks>
/// <param name="leaf">The leaf to be processed. A <c>null</c> leaf is ignored.</param>
/// <param name="sentence">The sentence tokens collected so far; new tokens are appended.</param>
/// <param name="names">The contraction spans collected so far; new spans are appended.</param>
private void ProcessLeaf(AdLeaf leaf, List<string> sentence, List<Span> names) {
    if (leaf == null) {
        return;
    }

    if (leftContractionPart == null) {
        var leafTag = leaf.SecondaryTag;
        if (leafTag != null && leafTag.Contains("<sam->")) {
            var lexemes = leaf.Lexeme.RegExSplit(Expressions.Expression.Underline);

            // Emit every lexeme but the last, marking those that Expand() recognizes.
            for (var i = 0; i < lexemes.Length - 1; i++) {
                sentence.Add(lexemes[i]);
                if (PortugueseContractionUtility.Expand(lexemes[i]) != null) {
                    names.Add(new Span(sentence.Count - 1, sentence.Count, "default"));
                }
            }

            // The last lexeme waits for the next leaf.
            leftContractionPart = lexemes[lexemes.Length - 1];
            return;
        }

        sentence.Add(leaf.Lexeme);
        return;
    }

    // A left part is pending: this leaf should provide the right part of the contraction.
    var tag = leaf.SecondaryTag;
    var right = leaf.Lexeme;

    if (tag != null && tag.Contains("<-sam>")) {
        var parts = right.RegExSplit(Expressions.Expression.Underline);
        if (parts == null || parts.Length == 0) {
            // Nothing to split: treat the whole lexeme as the right part.
            parts = new[] { right };
        }

        // Try to join only the first part with the pending left part.
        var first = parts[0];
        var contraction = PortugueseContractionUtility.ToContraction(leftContractionPart, first);
        if (contraction != null) {
            sentence.Add(contraction);
            names.Add(new Span(sentence.Count - 1, sentence.Count, "default"));
        } else {
            // Previously this fallback ran only when the split returned null, so a failed join
            // on the normal path dropped both tokens. Keep them and report the problem.
            if (monitor != null) {
                monitor.OnError("ContractionNameSample: Missing " + leftContractionPart + " + " + first);
            }

            sentence.Add(leftContractionPart);
            sentence.Add(first);
        }

        for (var i = 1; i < parts.Length; i++) {
            sentence.Add(parts[i]);
        }
    } else {
        // NOTE(review): neither the pending left part nor this leaf's lexeme is added to the
        // sentence here. That is the original behavior and is kept; confirm it is intended.
        if (monitor != null) {
            monitor.OnError("ContractionNameSample: No match " + leftContractionPart + " + " + right);
        }
    }

    leftContractionPart = null;
}