Beispiel #1
0
        /// <summary>
        /// Appends the token(s) of one Árvores Deitadas leaf to the sentence, merging the leaf with a
        /// pending left contraction part when one was left behind by the previous leaf.
        /// </summary>
        /// <param name="leaf">The leaf to be processed; a <c>null</c> leaf is ignored.</param>
        /// <param name="sentence">The sentence tokens collected so far; new tokens are appended to it.</param>
        private void ProcessLeaf(AdLeaf leaf, List <string> sentence)
        {
            if (leaf == null)
            {
                return;
            }

            var consumed = false;

            if (leftContractionPart != null)
            {
                // The previous leaf stored the first half of a contraction; try to join it with this lexeme.
                var merged = PortugueseContractionUtility.ToContraction(leftContractionPart, leaf.Lexeme);
                if (merged == null)
                {
                    // No contraction could be built: emit the pending part on its own and let the
                    // current lexeme go through the regular path below.
                    sentence.Add(leftContractionPart);

                    if (monitor != null)
                    {
                        monitor.OnWarning(string.Format("Missing contraction for: {0} - {1}", leftContractionPart, leaf.Lexeme));
                    }
                }
                else
                {
                    sentence.AddRange(merged.RegExSplit(Expressions.Expression.Space));
                    consumed = true;
                }

                leftContractionPart = null;
            }

            if (consumed)
            {
                // The lexeme is already part of the contraction that was just added.
                return;
            }

            var secondary = leaf.SecondaryTag;
            if (secondary == null || !secondary.Contains("<sam->"))
            {
                sentence.AddRange(ProcessLexeme(leaf.Lexeme));
                return;
            }

            // "<sam->" leaf: every underscore-separated piece but the last is emitted now; the last
            // piece is kept so that the next leaf can be joined with it.
            var pieces = leaf.Lexeme.RegExSplit(Expressions.Expression.Underline);
            var last   = pieces.Length - 1;
            if (last > 0)
            {
                sentence.AddRange(pieces.SubArray(0, last));
            }

            leftContractionPart = pieces[last];
        }
Beispiel #2
0
        /// <summary>
        /// Appends one leaf to the three parallel lists (token, tag and chunk target).
        /// </summary>
        /// <param name="leaf">The leaf.</param>
        /// <param name="isIntermediate">
        /// if set to <c>true</c> the chunk target is prefixed with <c>I-</c>, otherwise with <c>B-</c>.
        /// The prefix is only applied when the phrase tag is not <c>Other</c>.
        /// </param>
        /// <param name="phraseTag">
        /// The phrase tag. When it equals <c>Other</c> and the leaf has a functional tag, it is replaced
        /// by the result of <c>GetPhraseTagFromPosTag</c>.
        /// </param>
        /// <param name="sentence">The sentence tokens; receives the leaf lexeme.</param>
        /// <param name="tags">The tags; receives the functional tag, or the lexeme when no tag is available.</param>
        /// <param name="target">The chunk targets; receives the computed chunk tag.</param>
        protected void ProcessLeaf(
            AdLeaf leaf,
            bool isIntermediate,
            string phraseTag,
            List <string> sentence,
            List <string> tags,
            List <string> target)
        {
            if (leaf.FunctionalTag != null && phraseTag.Equals(Other))
            {
                phraseTag = GetPhraseTagFromPosTag(leaf.FunctionalTag);
            }

            string chunkTag;
            if (phraseTag.Equals(Other))
            {
                chunkTag = phraseTag;
            }
            else
            {
                chunkTag = (isIntermediate ? "I-" : "B-") + phraseTag;
            }

            sentence.Add(leaf.Lexeme);

            // The original test looked only at SecondaryTag but then added FunctionalTag, so a leaf with
            // a secondary tag and no functional tag pushed null into the tag list. Leaves without a
            // secondary tag still use the lexeme (unchanged); a missing functional tag now falls back
            // to the lexeme as well.
            var posTag = leaf.SecondaryTag == null ? null : leaf.FunctionalTag;
            tags.Add(posTag ?? leaf.Lexeme);

            target.Add(chunkTag);
        }
Beispiel #3
0
        /// <summary>
        /// Appends the token(s) of one Árvores Deitadas leaf to the sentence and records the named
        /// entity span found on the leaf, if any.
        /// </summary>
        /// <param name="leaf">The leaf to be processed; a <c>null</c> leaf is ignored.</param>
        /// <param name="sentence">The sentence tokens collected so far; new tokens are appended to it.</param>
        /// <param name="names">The name spans collected so far; may be extended or have its last span widened.</param>
        private void ProcessLeaf(AdLeaf leaf, List <string> sentence, List <Span> names)
        {
            if (leaf == null)
            {
                return;
            }

            var tokenEmitted = false;

            if (leftContractionPart != null)
            {
                // The previous leaf stored the first half of a contraction; try to join it with this lexeme.
                var joined = PortugueseContractionUtility.ToContraction(leftContractionPart, leaf.Lexeme);
                if (joined == null)
                {
                    // No contraction could be built: emit the pending part alone, the lexeme follows below.
                    sentence.Add(leftContractionPart);
                }
                else
                {
                    sentence.AddRange(joined.RegExSplit(Expressions.Expression.Space));
                    tokenEmitted = true;
                }

                leftContractionPart = null;
            }

            string entityType        = null;
            var    continuesPrevious = false; // set when the leaf carries a <NER2> tag

            var secondary = leaf.SecondaryTag;
            if (secondary != null)
            {
                if (!tokenEmitted && secondary.Contains("<sam->"))
                {
                    // Emit every underscore-separated piece but the last; the last one waits for the next leaf.
                    var pieces = leaf.Lexeme.RegExSplit(Expressions.Expression.Underline);
                    var tail   = pieces.Length - 1;
                    if (tail > 0)
                    {
                        sentence.AddRange(pieces.SubArray(0, tail));
                    }

                    leftContractionPart = pieces[tail];
                    return;
                }

                continuesPrevious = secondary.Contains("<NER2>");
                entityType        = GetNER(secondary);
            }

            // Token index at which a span starting on this leaf would begin.
            var entityStart = sentence.Count;

            if (!tokenEmitted)
            {
                sentence.AddRange(ProcessLexeme(leaf.Lexeme));
            }

            if (entityType != null)
            {
                names.Add(new Span(entityStart, sentence.Count, entityType));
            }

            if (!continuesPrevious || names.Count == 0)
            {
                return;
            }

            // A <NER2> leaf can be the continuation of the last recorded name: widen that span when it
            // ends right before the token just added. Otherwise it is probably not the same name and
            // the leaf is silently skipped.
            var lastIndex = names.Count - 1;
            var previous  = names[lastIndex];
            if (previous.End == sentence.Count - 1)
            {
                names[lastIndex] = new Span(previous.Start, sentence.Count, previous.Type);
            }
        }
Beispiel #4
0
        /// <summary>
        /// Converts the specified string representation of a tree element to its <see cref="AdTreeElement"/>
        /// equivalent and returns a value that indicates whether the conversion succeeded.
        /// </summary>
        /// <param name="element">
        /// When this method returns, contains the <see cref="AdTreeElement"/> value equivalent to the element
        /// contained in <paramref name="line"/>, if the conversion succeeded, or <c>null</c> if the conversion
        /// failed. The conversion fails if the <paramref name="line"/> parameter is null or does not contain
        /// a valid string representation of a AdElement. This parameter is passed uninitialized.
        /// </param>
        /// <param name="line">The string representation of the element.</param>
        /// <param name="safeParse">
        /// if set to <c>true</c> only the three regular patterns (node, leaf, punctuation) are tried and
        /// anything else is rejected without a warning.
        /// </param>
        /// <param name="monitor">The evaluation monitor; may be <c>null</c>.</param>
        /// <returns>
        /// <c>true</c> if the <paramref name="line"/> parameter was converted successfully; otherwise, <c>false</c>.
        /// </returns>
        private static bool TryParseElement(out AdTreeElement element, string line, bool safeParse, Monitor monitor)
        {
            element = null;

            // Regex.Match throws on a null input; a Try-method must report the failure instead.
            if (line == null)
            {
                return false;
            }

            var m = nodePattern.Match(line);
            if (m.Success)
            {
                element = new AdNode {
                    Level        = m.Groups[1].Length + 1,
                    SyntacticTag = m.Groups[2].Value
                };
                return true;
            }

            m = leafPattern.Match(line);
            if (m.Success)
            {
                element = new AdLeaf {
                    Level            = m.Groups[1].Length + 1,
                    SyntacticTag     = m.Groups[2].Value,
                    FunctionalTag    = m.Groups[3].Value,
                    Lemma            = m.Groups[4].Value,
                    SecondaryTag     = m.Groups[5].Value,
                    MorphologicalTag = m.Groups[6].Value,
                    Lexeme           = m.Groups[7].Value
                };
                return true;
            }

            m = punctuationPattern.Match(line);
            if (m.Success)
            {
                element = new AdLeaf {
                    Level  = m.Groups[1].Length + 1,
                    Lexeme = m.Groups[2].Value
                };
                return true;
            }

            if (safeParse)
            {
                return false;
            }

            // Knuppe: The most bizarre cases I found, were invalid data (like HTML, inside the sentences)
            //         so I decided to implement the safeParse attribute, to ignore this junk...
            //
            //         I think any program should adapt to an error in a file. otherwise the files will never
            //         be fixed...

            // Known junk lines are rejected without a warning. The markers are plain corpus markup,
            // so they are compared ordinally rather than with the current culture.
            if (line.Equals("_") ||
                line.StartsWith("<lixo", StringComparison.Ordinal) ||
                line.StartsWith("pause", StringComparison.Ordinal))
            {
                return false;
            }

            if (line.StartsWith("=", StringComparison.Ordinal))
            {
                m = bizarreLeafPattern.Match(line);
                if (m.Success)
                {
                    var leaf = new AdLeaf {
                        Level            = m.Groups[1].Length + 1,
                        SyntacticTag     = m.Groups[2].Value,
                        Lemma            = m.Groups[3].Value,
                        MorphologicalTag = m.Groups[4].Value,
                        Lexeme           = m.Groups[5].Value
                    };

                    // Lemmas longer than two characters lose their first character.
                    if (!string.IsNullOrEmpty(leaf.Lemma) && leaf.Lemma.Length > 2)
                    {
                        leaf.Lemma = leaf.Lemma.Substring(1);
                    }

                    element = leaf;
                    return true;
                }

                // Last resort: take the text that follows the last '=' (skipping one more character)
                // as the lexeme of a tag-less leaf.
                var level = line.LastIndexOf("=", StringComparison.Ordinal) + 1;
                if (level > 0 && level < line.Length - 2)
                {
                    var lexeme = line.Substring(level + 1);
                    if (Regex.IsMatch(lexeme, "\\w.*?[\\.<>].*"))
                    {
                        element = new AdLeaf {
                            Level  = level + 1,
                            Lexeme = lexeme
                        };
                        return true;
                    }
                }
            }

            if (monitor != null)
            {
                monitor.OnWarning("Couldn't parse leaf: " + line);
            }

            return false;
        }
Beispiel #5
0
        /// <summary>
        /// Appends the token(s) of one leaf to four parallel lists: the tokens, their tags, a contraction
        /// marker and a proper-name marker. Every token that is added gets exactly one entry in each list.
        /// </summary>
        /// <param name="leaf">The leaf to be processed; a <c>null</c> leaf is ignored.</param>
        /// <param name="sentence">Receives the token(s).</param>
        /// <param name="tags">Receives the tag of each token.</param>
        /// <param name="con">Receives <c>"B"</c>, <c>"E"</c> or <c>null</c> for each token.</param>
        /// <param name="prop">Receives <c>"P"</c> or <c>null</c> for each token.</param>
        private void ProcessLeaf(AdLeaf leaf, List <string> sentence, List <string> tags, List <string> con, List <string> prop)
        {
            if (leaf == null)
            {
                return;
            }

            var token = leaf.Lexeme;

            // Guillemets are replaced by a plain double quote only while callsCount is even,
            // so only part of the quotation marks is changed.
            if ((token == "«" || token == "»") && callsCount % 2 == 0)
            {
                token = "\"";
            }

            // "B" for a leaf tagged <sam->, "E" for one tagged <-sam>, null otherwise.
            string contractionMark = null;
            var secondary = leaf.SecondaryTag;
            if (secondary != null)
            {
                if (secondary.Contains("<sam->"))
                {
                    contractionMark = "B";
                }
                else if (secondary.Contains("<-sam>"))
                {
                    contractionMark = "E";
                }
            }

            // The tag is the functional tag (or the token itself when there is none), optionally followed
            // by the morphological tag, with whitespace replaced by '='.
            var posTag = leaf.FunctionalTag ?? token;
            if (includeFeatures && !string.IsNullOrEmpty(leaf.MorphologicalTag))
            {
                posTag = posTag + " " + leaf.MorphologicalTag;
            }

            posTag = posTag.RegExReplace(Expressions.Expression.Space, "=") ?? token;

            if (expandMe && token.Contains("_"))
            {
                var pieces = new StringTokenizer(token, "_");

                if (posTag == "prop")
                {
                    // Proper names are kept as a single token and flagged in the prop list.
                    sentence.Add(token);
                    tags.Add(posTag);
                    con.Add(null);
                    prop.Add("P");
                    return;
                }

                if (pieces.CountTokens > 0)
                {
                    // Split the multi-word expression: first piece gets "B-", the following ones "I-".
                    var words       = new List <string>(pieces.CountTokens);
                    var chunkedTags = new List <string>(pieces.CountTokens);
                    var prefix      = "B-";
                    do
                    {
                        words.Add(pieces.NextToken);
                        chunkedTags.Add(prefix + posTag);
                        prefix = "I-";
                    } while (pieces.HasMoreTokens);

                    // Only the last piece carries the contraction marker.
                    if (contractionMark == null)
                    {
                        con.AddRange(new string[words.Count]);
                    }
                    else
                    {
                        con.AddRange(new string[words.Count - 1]);
                        con.Add(contractionMark);
                    }

                    sentence.AddRange(words);
                    tags.AddRange(chunkedTags);
                    prop.AddRange(new string[words.Count]);
                    return;
                }

                sentence.Add(token);
                tags.Add(posTag);
                prop.Add(null);
                con.Add(contractionMark);
                return;
            }

            if (token.Contains(hyphen) && token.Length > 1)
            {
                var hyphenMatch = hyphenRegex.Match(token);
                if (!hyphenMatch.Success)
                {
                    sentence.Add(token);
                    tags.Add(posTag);
                    prop.Add(null);
                    con.Add(contractionMark);
                    return;
                }

                string before = null;
                string after  = null;
                string tail   = null;

                var groups = hyphenMatch.Groups;
                if (groups[1].Success)
                {
                    before = groups[2].Value;
                }
                else if (groups[3].Success)
                {
                    after = groups[4].Value;
                    tail  = groups[5].Value;
                }
                else if (groups[6].Success)
                {
                    before = groups[7].Value;
                    after  = groups[8].Value;
                    tail   = groups[9].Value;
                }
                else
                {
                    throw new InvalidFormatException("Wrong hyphen pattern.");
                }

                // Emit, in order: text before the hyphen, the hyphen itself, text after it and the rest.
                // Empty slots are skipped; the hyphen is tagged with itself, everything else with posTag.
                const int hyphenSlot = 1;
                var slots = new[] { before, hyphen, after, tail };
                for (var i = 0; i < slots.Length; i++)
                {
                    var piece = slots[i];
                    if (string.IsNullOrEmpty(piece))
                    {
                        continue;
                    }

                    sentence.Add(piece);
                    tags.Add(i == hyphenSlot ? hyphen : posTag);
                    prop.Add(null);
                    con.Add(contractionMark);
                }

                return;
            }

            // Plain token: the only path on which AddGender is applied to the tag.
            posTag = AddGender(posTag, leaf.MorphologicalTag);

            sentence.Add(token);
            tags.Add(posTag);
            prop.Add(null);
            con.Add(contractionMark);
        }
        /// <summary>
        /// Appends the token(s) of one leaf to the sentence and records a <c>"default"</c> span for every
        /// token that is a contraction.
        /// </summary>
        /// <remarks>
        /// A leaf tagged <c>&lt;sam-&gt;</c> stores its last underscore-separated piece in
        /// <c>leftContractionPart</c>; the next leaf is then joined with that piece. When the two parts
        /// cannot be joined the problem is reported to the monitor and both parts are added to the
        /// sentence as separate tokens, so that no token of the input is lost.
        /// </remarks>
        /// <param name="leaf">The leaf to be processed; a <c>null</c> leaf is ignored.</param>
        /// <param name="sentence">The sentence tokens collected so far; new tokens are appended to it.</param>
        /// <param name="names">The contraction spans collected so far; new spans are appended to it.</param>
        private void ProcessLeaf(AdLeaf leaf, List <string> sentence, List <Span> names)
        {
            if (leaf == null)
            {
                return;
            }

            if (leftContractionPart == null)
            {
                var leafTag = leaf.SecondaryTag;

                if (leafTag != null && leafTag.Contains("<sam->"))
                {
                    // Emit every piece but the last one; a piece that Expand() recognises is marked as a name.
                    var lexemes = leaf.Lexeme.RegExSplit(Expressions.Expression.Underline);
                    for (var i = 0; i < lexemes.Length - 1; i++)
                    {
                        sentence.Add(lexemes[i]);

                        if (PortugueseContractionUtility.Expand(lexemes[i]) != null)
                        {
                            names.Add(new Span(sentence.Count - 1, sentence.Count, "default"));
                        }
                    }

                    // The last piece waits for the next leaf.
                    leftContractionPart = lexemes[lexemes.Length - 1];
                    return;
                }

                sentence.Add(leaf.Lexeme);
                return;
            }

            // A left part is pending: this leaf is expected to carry the right part.
            var tag   = leaf.SecondaryTag;
            var right = leaf.Lexeme;

            if (tag != null && tag.Contains("<-sam>"))
            {
                var parts    = right.RegExSplit(Expressions.Expression.Underline);
                var hasParts = parts != null && parts.Length > 0;

                // try to join only the first piece; fall back to the whole lexeme when there are no pieces
                var first = hasParts ? parts[0] : right;
                var c     = PortugueseContractionUtility.ToContraction(leftContractionPart, first);

                if (c != null)
                {
                    sentence.Add(c);
                    names.Add(new Span(sentence.Count - 1, sentence.Count, "default"));
                }
                else
                {
                    // Previously both halves were dropped here without any notice.
                    if (monitor != null)
                    {
                        monitor.OnError("ContractionNameSample: Missing " + leftContractionPart + " + " + first);
                    }
                    sentence.Add(leftContractionPart);
                    sentence.Add(first);
                }

                if (hasParts)
                {
                    for (var i = 1; i < parts.Length; i++)
                    {
                        sentence.Add(parts[i]);
                    }
                }
            }
            else
            {
                if (monitor != null)
                {
                    monitor.OnError("ContractionNameSample: No match " + leftContractionPart + " + " + right);
                }

                // Keep both tokens; previously the pending part and the current lexeme were discarded.
                sentence.Add(leftContractionPart);
                sentence.Add(right);
            }

            leftContractionPart = null;
        }
Beispiel #7
0
        /// <summary>
        /// Converts the specified string representation of a tree element to its <see cref="AdTreeElement"/>
        /// equivalent and returns a value that indicates whether the conversion succeeded.
        /// </summary>
        /// <param name="element">
        /// When this method returns, contains the <see cref="AdTreeElement"/> value equivalent to the element
        /// contained in <paramref name="line"/>, if the conversion succeeded, or <c>null</c> if the conversion
        /// failed. The conversion fails if the <paramref name="line"/> parameter is null or does not contain
        /// a valid string representation of a AdElement. This parameter is passed uninitialized.
        /// </param>
        /// <param name="line">The string representation of the element.</param>
        /// <param name="safeParse">
        /// if set to <c>true</c> only the three regular patterns (node, leaf, punctuation) are tried and
        /// anything else is rejected without a warning.
        /// </param>
        /// <param name="monitor">The evaluation monitor; may be <c>null</c>.</param>
        /// <returns>
        /// <c>true</c> if the <paramref name="line"/> parameter was converted successfully; otherwise, <c>false</c>.
        /// </returns>
        private static bool TryParseElement(out AdTreeElement element, string line, bool safeParse, Monitor monitor) {
            element = null;

            // Regex.Match throws on a null input; a Try-method must report the failure instead.
            if (line == null) {
                return false;
            }

            var m = nodePattern.Match(line);
            if (m.Success) {
                element = new AdNode {
                    Level = m.Groups[1].Length + 1,
                    SyntacticTag = m.Groups[2].Value
                };
                return true;
            }

            m = leafPattern.Match(line);
            if (m.Success) {
                element = new AdLeaf {
                    Level = m.Groups[1].Length + 1,
                    SyntacticTag = m.Groups[2].Value,
                    FunctionalTag = m.Groups[3].Value,
                    Lemma = m.Groups[4].Value,
                    SecondaryTag = m.Groups[5].Value,
                    MorphologicalTag = m.Groups[6].Value,
                    Lexeme = m.Groups[7].Value
                };
                return true;
            }

            m = punctuationPattern.Match(line);
            if (m.Success) {
                element = new AdLeaf {
                    Level = m.Groups[1].Length + 1,
                    Lexeme = m.Groups[2].Value
                };
                return true;
            }

            if (safeParse) {
                return false;
            }

            // Knuppe: The most bizarre cases I found, were invalid data (like HTML, inside the sentences)
            //         so I decided to implement the safeParse attribute, to ignore this junk...
            //
            //         I think any program should adapt to an error in a file. otherwise the files will never
            //         be fixed...

            // Known junk lines are rejected without a warning. The markers are plain corpus markup,
            // so they are compared ordinally rather than with the current culture.
            if (line.Equals("_") ||
                line.StartsWith("<lixo", StringComparison.Ordinal) ||
                line.StartsWith("pause", StringComparison.Ordinal)) {
                return false;
            }

            if (line.StartsWith("=", StringComparison.Ordinal)) {
                m = bizarreLeafPattern.Match(line);
                if (m.Success) {
                    var leaf = new AdLeaf {
                        Level = m.Groups[1].Length + 1,
                        SyntacticTag = m.Groups[2].Value,
                        Lemma = m.Groups[3].Value,
                        MorphologicalTag = m.Groups[4].Value,
                        Lexeme = m.Groups[5].Value
                    };

                    // Lemmas longer than two characters lose their first character.
                    if (!string.IsNullOrEmpty(leaf.Lemma) && leaf.Lemma.Length > 2) {
                        leaf.Lemma = leaf.Lemma.Substring(1);
                    }
                    element = leaf;
                    return true;
                }

                // Last resort: take the text that follows the last '=' (skipping one more character)
                // as the lexeme of a tag-less leaf.
                var level = line.LastIndexOf("=", StringComparison.Ordinal) + 1;
                if (level > 0 && level < line.Length - 2) {
                    var lexeme = line.Substring(level + 1);
                    if (Regex.IsMatch(lexeme, "\\w.*?[\\.<>].*")) {
                        element = new AdLeaf {
                            Level = level + 1,
                            Lexeme = lexeme
                        };
                        return true;
                    }
                }
            }

            if (monitor != null) {
                monitor.OnWarning("Couldn't parse leaf: " + line);
            }

            return false;
        }
Beispiel #8
0
 /// <summary>
 /// Gets the chunk tag of a leaf.
 /// </summary>
 /// <param name="leaf">The leaf.</param>
 /// <returns><c>"VP"</c> when the syntactic tag of the leaf is <c>"P"</c>; otherwise <c>null</c>.</returns>
 protected virtual string GetChunkTag(AdLeaf leaf)
 {
     // The test used to read SecondaryTag, but secondary tags are angle-bracket markers such as
     // "<sam->" or "<NER2>", so the comparison with "P" could not succeed for such leaves.
     // NOTE(review): "P" is presumably the predicator function carried by the syntactic tag, as in
     // the upstream OpenNLP AD chunk stream - confirm against the corpus.
     return leaf.SyntacticTag == "P" ? "VP" : null;
 }