Пример #1
0
        /// <summary>
        /// Processes a leaf of the Arvores Deitadas (AD) format.
        /// </summary>
        /// <param name="leaf">The leaf to be processed.</param>
        /// <param name="sentence">The sentence tokens we got so far.</param>
        /// <remarks>
        /// A contraction spans two consecutive leaves. A leaf whose secondary tag contains
        /// <c>&lt;sam-&gt;</c> stores the last piece of its lexeme (split on
        /// <c>Expressions.Expression.Underline</c>) in <c>leftContractionPart</c>; the next
        /// call tries to join that piece with the new leaf's lexeme.
        /// </remarks>
        private void ProcessLeaf(AdLeaf leaf, List <string> sentence)
        {
            if (leaf == null)
            {
                return;
            }

            // True once this leaf's lexeme has been emitted as part of a contraction.
            var merged = false;

            if (leftContractionPart != null)
            {
                var contraction = PortugueseContractionUtility.ToContraction(leftContractionPart, leaf.Lexeme);
                merged = contraction != null;

                if (merged)
                {
                    sentence.AddRange(contraction.RegExSplit(Expressions.Expression.Space));
                }
                else
                {
                    // No contraction was found for this pair: emit the pending left part on
                    // its own and let the current leaf be handled as a regular leaf below.
                    sentence.Add(leftContractionPart);

                    if (monitor != null)
                    {
                        monitor.OnWarning(string.Format("Missing contraction for: {0} - {1}", leftContractionPart, leaf.Lexeme));
                    }
                }

                // The pending part is consumed either way.
                leftContractionPart = null;
            }

            var secondaryTag     = leaf.SecondaryTag;
            var opensContraction = secondaryTag != null && secondaryTag.Contains("<sam->") && !merged;

            if (opensContraction)
            {
                // Everything but the last piece goes straight into the sentence; the last
                // piece waits for the next leaf.
                var pieces    = leaf.Lexeme.RegExSplit(Expressions.Expression.Underline);
                var lastIndex = pieces.Length - 1;

                if (lastIndex > 0)
                {
                    sentence.AddRange(pieces.SubArray(0, lastIndex));
                }

                leftContractionPart = pieces[lastIndex];
                return;
            }

            if (merged)
            {
                // The lexeme is already in the sentence as part of the contraction.
                return;
            }

            sentence.AddRange(ProcessLexeme(leaf.Lexeme));
        }
Пример #2
0
        /// <summary>
        /// Processes a leaf of the Arvores Deitadas (AD) format.
        /// </summary>
        /// <param name="leaf">The leaf to be processed.</param>
        /// <param name="sentence">The sentence tokens we got so far.</param>
        /// <param name="names">The names we got so far.</param>
        /// <remarks>
        /// A contraction spans two consecutive leaves. A leaf whose secondary tag contains
        /// <c>&lt;sam-&gt;</c> stores the last piece of its lexeme in
        /// <c>leftContractionPart</c>; the next call tries to join that piece with the new
        /// leaf's lexeme. A leaf tagged <c>&lt;NER2&gt;</c> may extend the most recently
        /// added span by the token it contributes.
        /// </remarks>
        private void ProcessLeaf(AdLeaf leaf, List <string> sentence, List <Span> names)
        {
            if (leaf == null)
            {
                return;
            }

            // True once this leaf's lexeme has been emitted as part of a contraction.
            var alreadyAdded = false;

            if (leftContractionPart != null)
            {
                // The previous leaf left a pending contraction part; try to join it with this leaf.
                var c = PortugueseContractionUtility.ToContraction(leftContractionPart, leaf.Lexeme);
                if (c != null)
                {
                    sentence.AddRange(c.RegExSplit(Expressions.Expression.Space));
                    alreadyAdded = true;
                }
                else
                {
                    // No contraction was found: emit the pending part alone and keep
                    // alreadyAdded false so the current leaf is processed normally.
                    sentence.Add(leftContractionPart);
                }
                leftContractionPart = null;
            }

            string namedEntityTag = null;
            var    expandLastNER  = false; // set when the leaf carries a <NER2> tag

            var leafTag = leaf.SecondaryTag;

            if (leafTag != null)
            {
                if (leafTag.Contains("<sam->") && !alreadyAdded)
                {
                    // Left half of a contraction: emit every piece but the last and keep
                    // the last one pending for the next leaf.
                    var lexemes = leaf.Lexeme.RegExSplit(Expressions.Expression.Underline);
                    if (lexemes.Length > 1)
                    {
                        sentence.AddRange(lexemes.SubArray(0, lexemes.Length - 1));
                    }
                    leftContractionPart = lexemes[lexemes.Length - 1];
                    return;
                }

                // This leaf can be part of the last name.
                expandLastNER  = leafTag.Contains("<NER2>");
                namedEntityTag = GetNER(leafTag);
            }

            // Token index where this leaf's own tokens start; only read when the leaf
            // carries a named-entity tag.
            var startOfNamedEntity = sentence.Count;

            if (!alreadyAdded)
            {
                sentence.AddRange(ProcessLexeme(leaf.Lexeme));
            }

            if (namedEntityTag != null)
            {
                names.Add(new Span(startOfNamedEntity, sentence.Count, namedEntityTag));
            }

            if (expandLastNER && names.Count > 0)
            {
                // If the last span ends exactly one token before the end of the sentence,
                // grow it to cover that token. Otherwise it is probably not the same
                // entity, so the leaf is skipped silently.
                var lastIndex = names.Count - 1;
                var last      = names[lastIndex];
                if (last.End == sentence.Count - 1)
                {
                    names[lastIndex] = new Span(last.Start, sentence.Count, last.Type);
                }
            }
        }
        /// <summary>
        /// Processes a leaf, appending its tokens to the sentence and recording a
        /// <c>"default"</c> span for every contraction token that is emitted.
        /// </summary>
        /// <param name="leaf">The leaf to be processed; <c>null</c> is ignored.</param>
        /// <param name="sentence">The sentence tokens we got so far.</param>
        /// <param name="names">The contraction spans we got so far.</param>
        /// <remarks>
        /// A leaf whose secondary tag contains <c>&lt;sam-&gt;</c> stores the last piece of
        /// its lexeme in <c>leftContractionPart</c>. The following leaf is expected to carry
        /// <c>&lt;-sam&gt;</c>; its first piece is joined with the pending part.
        /// </remarks>
        private void ProcessLeaf(AdLeaf leaf, List <string> sentence, List <Span> names)
        {
            if (leaf == null)
            {
                return;
            }

            if (leftContractionPart == null)
            {
                var leafTag = leaf.SecondaryTag;

                if (leafTag != null && leafTag.Contains("<sam->"))
                {
                    var lexemes = leaf.Lexeme.RegExSplit(Expressions.Expression.Underline);

                    // Emit every piece but the last; a piece that can itself be expanded
                    // is recorded as a one-token span.
                    for (var i = 0; i < lexemes.Length - 1; i++)
                    {
                        sentence.Add(lexemes[i]);

                        if (PortugueseContractionUtility.Expand(lexemes[i]) != null)
                        {
                            names.Add(new Span(sentence.Count - 1, sentence.Count, "default"));
                        }
                    }

                    // The last piece waits for the next leaf.
                    leftContractionPart = lexemes[lexemes.Length - 1];
                    return;
                }

                sentence.Add(leaf.Lexeme);
                return;
            }

            // A left part is pending: this leaf should be the right half of the contraction.
            var tag   = leaf.SecondaryTag;
            var right = leaf.Lexeme;

            if (tag != null && tag.Contains("<-sam>"))
            {
                var parts = right.RegExSplit(Expressions.Expression.Underline);

                // Try to join only the first piece; fall back to the whole lexeme if the
                // split produced nothing.
                var first = parts != null && parts.Length > 0 ? parts[0] : right;
                var c     = PortugueseContractionUtility.ToContraction(leftContractionPart, first);

                if (c != null)
                {
                    sentence.Add(c);
                    names.Add(new Span(sentence.Count - 1, sentence.Count, "default"));
                }
                else
                {
                    // No contraction was found: report it and keep both tokens so that
                    // nothing is lost from the sentence.
                    if (monitor != null)
                    {
                        monitor.OnError("ContractionNameSample: Missing " + leftContractionPart + " + " + first);
                    }
                    sentence.Add(leftContractionPart);
                    sentence.Add(first);
                }

                if (parts != null)
                {
                    for (var i = 1; i < parts.Length; i++)
                    {
                        sentence.Add(parts[i]);
                    }
                }
            }
            else
            {
                // NOTE(review): in this case neither the pending left part nor this leaf's
                // lexeme is added to the sentence - confirm that dropping both is intended.
                if (monitor != null)
                {
                    monitor.OnError("ContractionNameSample: No match " + leftContractionPart + " + " + right);
                }
            }

            leftContractionPart = null;
        }