コード例 #1
0
        /// <summary>
        /// Recursively walks <paramref name="node"/>, handing every leaf below it to <c>ProcessLeaf</c>
        /// together with the chunk tag chosen for it.
        /// </summary>
        /// <param name="node">The node whose elements are visited in order.</param>
        /// <param name="sentence">Forwarded unchanged to <c>ProcessLeaf</c> and to the recursive calls.</param>
        /// <param name="tags">Forwarded unchanged to <c>IsIntermediate</c>, <c>ProcessLeaf</c> and the recursive calls.</param>
        /// <param name="target">The chunk targets emitted so far; read after each child node to detect a type change.</param>
        /// <param name="inheritedTag">The phrase tag of the enclosing node, or <c>null</c> when there is none.</param>
        private void ProcessNode(AdNode node, List <string> sentence, List <string> tags,
                                 List <string> target, String inheritedTag)
        {
            var currentTag = GetChunkTag(node, inheritedTag, target.Count);

            // A node that has no chunk type of its own takes over the tag of its parent, when one exists.
            var usesParentTag = currentTag == Other && inheritedTag != null;
            if (usesParentTag)
            {
                currentTag = inheritedTag;
            }

            for (var position = 0; position < node.Elements.Count; position++)
            {
                var element = node.Elements[position];

                if (!element.IsLeaf)
                {
                    var firstNewTarget = target.Count;

                    ProcessNode((AdNode)element, sentence, tags, target, currentTag);

                    // If the child emitted any target of a different type, the chunk sequence is broken
                    // and the remaining elements of this node fall back to Other.
                    var expectedSuffix = "-" + currentTag;
                    var cursor         = target.Count - 1;
                    while (cursor >= firstNewTarget)
                    {
                        if (!target[cursor].EndsWith(expectedSuffix))
                        {
                            currentTag = Other;
                            break;
                        }
                        cursor--;
                    }

                    continue;
                }

                var leaf    = (AdLeaf)element;
                var leafTag = currentTag;

                // A leaf-specific chunk tag, when present, overrides the phrase tag.
                var localChunk = GetChunkTag(leaf);
                if (localChunk != null && !leafTag.Equals(localChunk))
                {
                    leafTag = localChunk;
                }

                var continuesChunk = IsIntermediate(tags, target, leafTag) && (usesParentTag || position > 0);

                if (!IncludePunctuations && leaf.FunctionalTag == null)
                {
                    // A leaf without a functional tag keeps its chunk only when it sits between two leaves.
                    var hasLeafOnBothSides =
                        (position + 1 < node.Elements.Count && node.Elements[position + 1].IsLeaf) &&
                        (position > 0 && node.Elements[position - 1].IsLeaf);

                    if (!hasLeafOnBothSides)
                    {
                        continuesChunk = false;
                        leafTag        = Other;
                    }
                }

                ProcessLeaf(leaf, continuesChunk, leafTag, sentence, tags, target);
            }
        }
コード例 #2
0
        /// <summary>
        /// Walks a node in Arvores Deitadas format recursively, passing every leaf found
        /// to <c>ProcessLeaf</c> in the order the elements are stored.
        /// </summary>
        /// <param name="node">The node to traverse. A <c>null</c> node is ignored.</param>
        /// <param name="sentence">The sentence tokens collected so far; forwarded to <c>ProcessLeaf</c>.</param>
        private void Process(AdNode node, List <string> sentence)
        {
            if (node != null)
            {
                foreach (var child in node.Elements)
                {
                    if (!child.IsLeaf)
                    {
                        // inner node: descend
                        Process((AdNode)child, sentence);
                        continue;
                    }

                    ProcessLeaf((AdLeaf)child, sentence);
                }
            }
        }
コード例 #3
0
        /// <summary>
        /// Processes the direct children of the root node: leaves are emitted with the
        /// <c>Other</c> tag, inner nodes are delegated to <c>ProcessNode</c> without an inherited tag.
        /// </summary>
        /// <param name="root">The root node. A <c>null</c> root is ignored.</param>
        /// <param name="sentence">Forwarded to <c>ProcessLeaf</c> and <c>ProcessNode</c>.</param>
        /// <param name="tags">Forwarded to <c>ProcessLeaf</c> and <c>ProcessNode</c>.</param>
        /// <param name="target">Forwarded to <c>ProcessLeaf</c> and <c>ProcessNode</c>.</param>
        protected void ProcessRoot(AdNode root, List <string> sentence, List <string> tags, List <string> target)
        {
            if (root != null)
            {
                foreach (var child in root.Elements)
                {
                    if (!child.IsLeaf)
                    {
                        // top-level phrases start without an inherited tag
                        ProcessNode((AdNode)child, sentence, tags, target, null);
                        continue;
                    }

                    // a leaf hanging directly from the root is never the continuation of a chunk
                    ProcessLeaf((AdLeaf)child, false, Other, sentence, tags, target);
                }
            }
        }
コード例 #4
0
        /// <summary>
        /// Derives the chunk tag of a node from its syntactic tag.
        /// </summary>
        /// <remarks>
        /// The part of the syntactic tag after the last ':' is taken and trailing '-' characters are
        /// removed. The values "np", "vp", "pp", "ap" and "advp" are returned upper-cased;
        /// "adjp" is treated as "np" unless <paramref name="parent"/> equals "NP". Everything else
        /// yields <c>Other</c>.
        /// </remarks>
        /// <param name="node">The node whose <c>SyntacticTag</c> is inspected.</param>
        /// <param name="parent">The tag of the enclosing node as supplied by the caller; may be <c>null</c>.</param>
        /// <param name="index">Not used by this implementation.</param>
        /// <returns>The upper-cased chunk tag, or <c>Other</c>.</returns>
        protected virtual string GetChunkTag(AdNode node, string parent, int index)
        {
            var syntacticTag = node.SyntacticTag;
            var separatorAt  = syntacticTag.LastIndexOf(":", StringComparison.Ordinal);

            // keep only what follows the last ':' (the whole tag when there is none)
            var chunk = syntacticTag.Substring(separatorAt + 1);

            // drop every trailing '-'
            while (chunk.EndsWith("-"))
            {
                chunk = chunk.Substring(0, chunk.Length - 1);
            }

            if (chunk == "adjp" && parent != "NP")
            {
                chunk = "np";
            }

            // maybe we should use only np, vp and pp, but will keep ap and advp.
            switch (chunk)
            {
                case "np":
                case "vp":
                case "pp":
                case "ap":
                case "advp":
                    return chunk.ToUpperInvariant();

                default:
                    return Other;
            }
        }
コード例 #5
0
        /// <summary>
        /// Tries to build an <see cref="AdSentence" /> from the textual block of one sentence
        /// (header lines, a <c>SOURCE</c> line, the sentence text line and the tree lines).
        /// A return value indicates whether the conversion succeeded or failed.
        /// </summary>
        /// <remarks>
        /// This method does not let exceptions escape: anything thrown while parsing (including the
        /// <see cref="System.InvalidOperationException" /> raised for a node without a valid parent) is
        /// caught, wrapped in a <see cref="System.IO.InvalidDataException" />, reported through
        /// <paramref name="monitor" /> when it is not <c>null</c>, and the method returns <c>false</c>.
        /// </remarks>
        /// <param name="sentence">
        /// When this method returns, contains the parsed sentence, or <c>null</c> if the conversion failed.
        /// </param>
        /// <param name="sentenceString">The textual representation of the sentence.</param>
        /// <param name="para">The paragraph number; written into the metadata as <c> p=&lt;para&gt;</c>.</param>
        /// <param name="isTitle">if set to <c>true</c>, <c> title</c> is appended to the metadata.</param>
        /// <param name="isBox">if set to <c>true</c>, <c> box</c> is appended to the metadata.</param>
        /// <param name="safeParse">
        /// if set to <c>true</c>, a tree line that cannot be parsed makes the whole sentence fail;
        /// otherwise such a line is skipped. The value is also forwarded to <c>TryParseElement</c>.
        /// </param>
        /// <param name="monitor">The evaluation monitor that receives warnings and exceptions. This value can be a <c>null</c> value.</param>
        /// <returns><c>true</c> if the <paramref name="sentenceString"/> parameter was converted successfully, <c>false</c> otherwise.</returns>
        public static bool TryParse(
            out AdSentence sentence,
            string sentenceString,
            int para,
            bool isTitle,
            bool isBox,
            bool safeParse,
            Monitor monitor)
        {
            string text = null;
            string meta = null;
            var    sent = new AdSentence();

            try {
                using (var reader = new StringReader(sentenceString)) {
                    // first line is <s ...>
                    var line = reader.ReadLine();

                    if (line == null)
                    {
                        sentence = null;
                        return(false);
                    }

                    // Set when a "&&" line (alternative parse suggestion) is found before any SOURCE line.
                    // In that case no text/metadata is read here and both stay null.
                    var useSameTextAndMeta = false; // to handle cases where there are diff sug of parse (&&)

                    // Advance to the SOURCE line; running out of input before finding it is a failure.
                    while (!line.StartsWith("SOURCE"))
                    {
                        if (line.Equals("&&"))
                        {
                            useSameTextAndMeta = true;
                            break;
                        }
                        line = reader.ReadLine();

                        if (line == null)
                        {
                            sentence = null;
                            return(false);
                        }
                    }

                    if (!useSameTextAndMeta)
                    {
                        // Everything after the first 7 characters of the SOURCE line
                        // (presumably the "SOURCE:" prefix - TODO confirm against the corpus format).
                        var metaFromSource = line.Substring(7);

                        // The line after SOURCE holds "<id> <sentence text>".
                        line = reader.ReadLine();

                        if (line == null)
                        {
                            sentence = null;
                            return(false);
                        }

                        var start = line.IndexOf(" ", StringComparison.InvariantCulture);

                        text = FixPunctuation(line.Substring(start + 1).Trim());

                        if (start > 0)
                        {
                            // metadata layout: <id> p=<para>[ title][ box]<rest of the SOURCE line>
                            meta = line.Substring(0, start) + " p=" + para;
                            if (isTitle)
                            {
                                meta += " title";
                            }

                            if (isBox)
                            {
                                meta += " box";
                            }

                            meta += metaFromSource;
                        }
                        else
                        {
                            // rare case where there is no space between the id and the sentence.

                            if (monitor != null)
                            {
                                monitor.OnWarning("A sentence was skipped due a possible integrity loss.");
                            }

                            // OpenNLP uses the previous meta, but it's better to just ignore the sentence
                            // since the previous meta is not related to the current one.

                            sentence = null;
                            return(false);
                        }
                    }
                    sent.Text     = text;
                    sent.Metadata = meta;

                    // skip lines starting with ###
                    line = reader.ReadLine();
                    while (line != null && line.StartsWith("###"))
                    {
                        line = reader.ReadLine();
                    }

                    // Stack of nodes that may become the parent of the next element; the root stays at index 0.
                    var nodeStack = new List <AdNode>();

                    sent.Root = new AdNode {
                        SyntacticTag = "ROOT",
                        Level        = 0
                    };

                    nodeStack.Add(sent.Root);

                    // Tree lines run until an empty line, the closing </s> tag or a "&&" separator.
                    while (!string.IsNullOrEmpty(line) && !line.StartsWith("</s>") && !line.Equals("&&"))
                    {
                        AdTreeElement element;

                        if (TryParseElement(out element, line, safeParse, monitor))
                        {
                            // The idea here is to keep a stack of nodes that are candidates for
                            // parenting the following elements (nodes and leafs).

                            // 1) When we get a new element, we check its level and remove from
                            // the top of the stack nodes that are brothers or nephews.
                            while (nodeStack.Count != 0 && element.Level > 0 &&
                                   element.Level <= nodeStack[nodeStack.Count - 1].Level)
                            {
                                nodeStack.RemoveAt(nodeStack.Count - 1); // pop
                            }

                            if (element.IsLeaf)
                            {
                                // 2b) There are parent candidates.
                                // look for the node with the correct level

                                if (element.Level == 0)
                                {
                                    // level-0 leaves hang directly from the root
                                    nodeStack[0].Elements.Add(element);
                                }
                                else
                                {
                                    // Walk the stack from the top down to the first node with a smaller
                                    // level; fall back to the root when none is found.
                                    var    peek   = nodeStack[nodeStack.Count - 1];
                                    var    index  = nodeStack.Count - 1;
                                    AdNode parent = null;
                                    while (parent == null)
                                    {
                                        if (peek.Level < element.Level)
                                        {
                                            parent = peek;
                                            break;
                                        }
                                        index--;
                                        if (index > -1)
                                        {
                                            peek = nodeStack[index];
                                        }
                                        else
                                        {
                                            parent = nodeStack[0];
                                        }
                                    }
                                    parent.AddElement(element);
                                }
                            }
                            else
                            {
                                // 3) Check if the element that is at the top of the stack is this
                                // node parent, if yes add it as a son

                                if (nodeStack.Count != 0 && nodeStack[nodeStack.Count - 1].Level < element.Level)
                                {
                                    nodeStack[nodeStack.Count - 1].AddElement(element);
                                }
                                else
                                {
                                    // caught by the catch block below and reported to the monitor
                                    throw new InvalidOperationException("Should not happen!");
                                }

                                // the new node becomes a parent candidate for the following elements
                                nodeStack.Add((AdNode)element);
                            }
                        }
                        else if (safeParse)
                        {
                            // invalid element, so we skip this sentence...
                            sentence = null;
                            return(false);
                        }
                        line = reader.ReadLine();
                    }
                }
            } catch (Exception ex) {
                // Any failure is reported (when a monitor is available) and turned into a false result.
                if (monitor != null)
                {
                    monitor.OnException(new InvalidDataException("Something went wrong during the AdSentence parse.", ex));
                }

                sentence = null;
                return(false);
            }

            sentence = sent;
            return(true);
        }
コード例 #6
0
        /// <summary>
        /// Tries to turn one tree line into an <see cref="AdTreeElement"/>: a node, a leaf or a
        /// punctuation leaf, tested in that order.
        /// </summary>
        /// <param name="element">
        /// When this method returns, contains the parsed <see cref="AdTreeElement"/> if the conversion
        /// succeeded, or <c>null</c> if it failed. This parameter is passed uninitialized.
        /// </param>
        /// <param name="line">The string representation of the element.</param>
        /// <param name="safeParse">
        /// if set to <c>true</c>, a line matching none of the three regular patterns fails immediately,
        /// without trying the recovery rules and without notifying the monitor.
        /// </param>
        /// <param name="monitor">The evaluation monitor that receives the warning for an unparsable line; may be <c>null</c>.</param>
        /// <returns><c>true</c> if <paramref name="line"/> was converted successfully; otherwise, <c>false</c>.</returns>
        private static bool TryParseElement(out AdTreeElement element, string line, bool safeParse, Monitor monitor)
        {
            var nodeMatch = nodePattern.Match(line);
            if (nodeMatch.Success)
            {
                var node = new AdNode();
                node.Level        = nodeMatch.Groups[1].Length + 1;
                node.SyntacticTag = nodeMatch.Groups[2].Value;

                element = node;
                return true;
            }

            var leafMatch = leafPattern.Match(line);
            if (leafMatch.Success)
            {
                var regularLeaf = new AdLeaf();
                regularLeaf.Level            = leafMatch.Groups[1].Length + 1;
                regularLeaf.SyntacticTag     = leafMatch.Groups[2].Value;
                regularLeaf.FunctionalTag    = leafMatch.Groups[3].Value;
                regularLeaf.Lemma            = leafMatch.Groups[4].Value;
                regularLeaf.SecondaryTag     = leafMatch.Groups[5].Value;
                regularLeaf.MorphologicalTag = leafMatch.Groups[6].Value;
                regularLeaf.Lexeme           = leafMatch.Groups[7].Value;

                element = regularLeaf;
                return true;
            }

            var punctuationMatch = punctuationPattern.Match(line);
            if (punctuationMatch.Success)
            {
                var punctuation = new AdLeaf();
                punctuation.Level  = punctuationMatch.Groups[1].Length + 1;
                punctuation.Lexeme = punctuationMatch.Groups[2].Value;

                element = punctuation;
                return true;
            }

            // In safe mode nothing beyond the regular patterns is attempted.
            if (safeParse)
            {
                element = null;
                return false;
            }

            // Knuppe: The most bizarre cases I found were invalid data (like HTML inside the sentences),
            //         so I decided to implement the safeParse attribute to ignore this junk...
            //
            //         I think any program should adapt to an error in a file, otherwise the files will
            //         never be fixed...

            // Known junk lines are dropped silently.
            var isKnownJunk = line.Equals("_") || line.StartsWith("<lixo") || line.StartsWith("pause");
            if (isKnownJunk)
            {
                element = null;
                return false;
            }

            if (line.StartsWith("="))
            {
                var bizarreMatch = bizarreLeafPattern.Match(line);
                if (bizarreMatch.Success)
                {
                    var bizarreLeaf = new AdLeaf();
                    bizarreLeaf.Level            = bizarreMatch.Groups[1].Length + 1;
                    bizarreLeaf.SyntacticTag     = bizarreMatch.Groups[2].Value;
                    bizarreLeaf.Lemma            = bizarreMatch.Groups[3].Value;
                    bizarreLeaf.MorphologicalTag = bizarreMatch.Groups[4].Value;
                    bizarreLeaf.Lexeme           = bizarreMatch.Groups[5].Value;

                    // lemmas longer than two characters lose their first character
                    if (!string.IsNullOrEmpty(bizarreLeaf.Lemma) && bizarreLeaf.Lemma.Length > 2)
                    {
                        bizarreLeaf.Lemma = bizarreLeaf.Lemma.Substring(1);
                    }

                    element = bizarreLeaf;
                    return true;
                }

                // Last resort: use what follows the last '=' (skipping one more character) as the lexeme.
                var level = line.LastIndexOf("=", StringComparison.InvariantCulture) + 1;
                if (level > 0 && level < line.Length - 2)
                {
                    var remainder = line.Substring(level + 1);
                    if (Regex.IsMatch(remainder, "\\w.*?[\\.<>].*"))
                    {
                        var fallbackLeaf = new AdLeaf();
                        fallbackLeaf.Level  = level + 1;
                        fallbackLeaf.Lexeme = remainder;

                        element = fallbackLeaf;
                        return true;
                    }
                }
            }

            if (monitor != null)
            {
                monitor.OnWarning("Couldn't parse leaf: " + line);
            }

            element = null;
            return false;
        }
コード例 #7
0
ファイル: AdSentenceParser.cs プロジェクト: knuppe/SharpNL
        /// <summary>
        /// Converts one tree line into an <see cref="AdTreeElement"/>. The node, leaf and punctuation
        /// patterns are tried in that order; when none matches and <paramref name="safeParse"/> is
        /// <c>false</c>, a few recovery rules for malformed lines are applied.
        /// </summary>
        /// <param name="element">
        /// When this method returns, contains the parsed <see cref="AdTreeElement"/> if the conversion
        /// succeeded, or <c>null</c> if it failed. This parameter is passed uninitialized.
        /// </param>
        /// <param name="line">The string representation of the element.</param>
        /// <param name="safeParse">if set to <c>true</c>, lines that match no regular pattern fail without further attempts.</param>
        /// <param name="monitor">The evaluation monitor notified when a line cannot be parsed; may be <c>null</c>.</param>
        /// <returns><c>true</c> if <paramref name="line"/> was converted successfully; otherwise, <c>false</c>.</returns>
        private static bool TryParseElement(out AdTreeElement element, string line, bool safeParse, Monitor monitor) {
            var asNode = nodePattern.Match(line);
            if (asNode.Success) {
                var parsedNode = new AdNode();
                parsedNode.Level = asNode.Groups[1].Length + 1;
                parsedNode.SyntacticTag = asNode.Groups[2].Value;
                element = parsedNode;
                return true;
            }

            var asLeaf = leafPattern.Match(line);
            if (asLeaf.Success) {
                var parsedLeaf = new AdLeaf();
                parsedLeaf.Level = asLeaf.Groups[1].Length + 1;
                parsedLeaf.SyntacticTag = asLeaf.Groups[2].Value;
                parsedLeaf.FunctionalTag = asLeaf.Groups[3].Value;
                parsedLeaf.Lemma = asLeaf.Groups[4].Value;
                parsedLeaf.SecondaryTag = asLeaf.Groups[5].Value;
                parsedLeaf.MorphologicalTag = asLeaf.Groups[6].Value;
                parsedLeaf.Lexeme = asLeaf.Groups[7].Value;
                element = parsedLeaf;
                return true;
            }

            var asPunctuation = punctuationPattern.Match(line);
            if (asPunctuation.Success) {
                var mark = new AdLeaf();
                mark.Level = asPunctuation.Groups[1].Length + 1;
                mark.Lexeme = asPunctuation.Groups[2].Value;
                element = mark;
                return true;
            }

            // Safe mode stops here: no recovery attempts, no warning.
            if (safeParse) {
                element = null;
                return false;
            }

            // Knuppe: The most bizarre cases I found were invalid data (like HTML inside the sentences),
            //         so I decided to implement the safeParse attribute to ignore this junk...
            //
            //         I think any program should adapt to an error in a file, otherwise the files will
            //         never be fixed...

            // Lines known to be junk are rejected without a warning.
            var junk = line.Equals("_") || line.StartsWith("<lixo") || line.StartsWith("pause");
            if (junk) {
                element = null;
                return false;
            }

            if (line.StartsWith("=")) {
                var asBizarreLeaf = bizarreLeafPattern.Match(line);
                if (asBizarreLeaf.Success) {
                    var recovered = new AdLeaf();
                    recovered.Level = asBizarreLeaf.Groups[1].Length + 1;
                    recovered.SyntacticTag = asBizarreLeaf.Groups[2].Value;
                    recovered.Lemma = asBizarreLeaf.Groups[3].Value;
                    recovered.MorphologicalTag = asBizarreLeaf.Groups[4].Value;
                    recovered.Lexeme = asBizarreLeaf.Groups[5].Value;

                    // a lemma of more than two characters is stored without its first character
                    if (!string.IsNullOrEmpty(recovered.Lemma) && recovered.Lemma.Length > 2) {
                        recovered.Lemma = recovered.Lemma.Substring(1);
                    }

                    element = recovered;
                    return true;
                }

                // Fallback: the text after the last '=' (minus one more character) becomes the lexeme.
                var level = line.LastIndexOf("=", StringComparison.InvariantCulture) + 1;
                if (level > 0 && level < line.Length - 2) {
                    var tail = line.Substring(level + 1);
                    if (Regex.IsMatch(tail, "\\w.*?[\\.<>].*")) {
                        var bare = new AdLeaf();
                        bare.Level = level + 1;
                        bare.Lexeme = tail;
                        element = bare;
                        return true;
                    }
                }
            }

            if (monitor != null) {
                monitor.OnWarning("Couldn't parse leaf: " + line);
            }

            element = null;
            return false;
        }