Пример #1
0
        /// <summary>
        /// Reads the next <see cref="T:SentenceSample"/> from the underlying sentence stream.
        /// Keep calling this method until it returns <c>null</c>.
        /// </summary>
        /// <returns>
        /// A sample made of the space-joined text of the sentences accepted during this call plus
        /// one span per accepted sentence, or <c>null</c> when no sentence is left to read.
        /// </returns>
        public SentenceSample Read()
        {
            // Prime the look-ahead sentence when none is pending.
            if (sentence == null)
            {
                sentence = adSentenceStream.Read();
                UpdateMeta();
            }

            if (sentence == null)
            {
                return null;
            }

            var buffer = new StringBuilder();
            var spans  = new List<Span>();

            // Keep consuming sentences while either isSamePara or isSameText is set after the
            // look-ahead read (both are expected to be refreshed by UpdateMeta, which is defined
            // outside this block).
            do
            {
                // Title sentences are only kept when titles were requested.
                var accepted = !isTitle || isIncludeTitles;

                if (accepted && HasPunctuation(sentence.Text))
                {
                    var begin = buffer.Length;

                    buffer.Append(sentence.Text);
                    spans.Add(new Span(begin, buffer.Length));
                    buffer.Append(' ');
                }

                sentence = adSentenceStream.Read();
                UpdateMeta();
            }
            while (isSamePara || isSameText);

            // Every accepted sentence is followed by a separator; drop the trailing one.
            if (buffer.Length > 0)
            {
                buffer.Length--;
            }

            return new SentenceSample(buffer.ToString(), spans.ToArray());
        }
Пример #2
0
        /// <summary>
        /// Resolves the numeric id of the text that the given sentence belongs to, using the
        /// sentence metadata.
        /// </summary>
        /// <remarks>
        /// <para>
        /// The corpus type is detected once, from the first non-empty metadata seen while
        /// <c>corpusType</c> is still <c>Type.None</c>: a <c>LIT</c> prefix selects <c>Type.Lit</c>,
        /// a <c>CIE</c> prefix selects <c>Type.Cie</c> and anything else selects <c>Type.Ama</c>.
        /// </para>
        /// <para>
        /// For <c>Type.Ama</c> the id is the integer captured by <c>AmaMetaRegex</c>. For
        /// <c>Type.Cie</c> and <c>Type.Lit</c> the metadata carries a text name rather than a number,
        /// so a running counter (<c>textIdMeta2</c>) is advanced every time the captured name differs
        /// from the last one seen (<c>textMeta2</c>).
        /// </para>
        /// </remarks>
        /// <param name="sentence">The sentence whose metadata is inspected.</param>
        /// <returns>
        /// The text id, or <c>0</c> when the corpus type could not be determined yet.
        /// </returns>
        /// <exception cref="InvalidFormatException">
        /// The metadata does not match the pattern of the detected corpus type.
        /// </exception>
        private int GetTextId(AdSentence sentence)
        {
            if (corpusType == Type.None && !string.IsNullOrEmpty(sentence.Metadata))
            {
                // Ordinal comparison: the prefixes are fixed corpus markers, not linguistic text.
                if (sentence.Metadata.StartsWith("LIT", System.StringComparison.Ordinal))
                {
                    corpusType = Type.Lit;
                }
                else if (sentence.Metadata.StartsWith("CIE", System.StringComparison.Ordinal))
                {
                    corpusType = Type.Cie;
                }
                else
                {
                    corpusType = Type.Ama;
                }
            }

            Match match;

            switch (corpusType)
            {
            case Type.Ama:
                match = AmaMetaRegex.Match(sentence.Metadata);
                if (match.Success)
                {
                    // The id is machine-written, so parse it independently of the current culture.
                    return int.Parse(match.Groups[1].Value, System.Globalization.CultureInfo.InvariantCulture);
                }
                throw new InvalidFormatException("Invalid metadata: " + sentence.Metadata);

            case Type.Cie:
                match = CieMetaRegex.Match(sentence.Metadata);
                break;

            case Type.Lit:
                match = LitMetaRegex.Match(sentence.Metadata);
                break;

            default:
                return 0;
            }

            if (!match.Success)
            {
                throw new InvalidFormatException("Invalid metadata: " + sentence.Metadata);
            }

            // Cie and Lit share the same scheme: a new text name means a new text id. The name
            // itself must be compared here (not the numeric ids), otherwise a change of text in a
            // Lit corpus would not be detected from the metadata.
            var text = match.Groups[1].Value;

            if (!text.Equals(textMeta2))
            {
                textIdMeta2++;
                textMeta2 = text;
            }

            return textIdMeta2;
        }
Пример #3
0
        /// <summary>
        /// Tries to convert the string representation of a single sentence block into an
        /// <see cref="AdSentence" />. The return value indicates whether the conversion succeeded.
        /// </summary>
        /// <remarks>
        /// <para>
        /// Lines are skipped until one starting with <c>SOURCE</c> is found. The line after it is
        /// split at its first space: the leading token opens the metadata and the remainder (trimmed
        /// and passed through <c>FixPunctuation</c>) becomes the sentence text. The following lines,
        /// up to an empty line, a line starting with <c>&lt;/s&gt;</c> or a <c>&amp;&amp;</c> line,
        /// are parsed as tree elements and attached below a <c>ROOT</c> node according to their level.
        /// </para>
        /// <para>
        /// If a <c>&amp;&amp;</c> line is met before any <c>SOURCE</c> line, the header is not parsed
        /// and the resulting sentence has <c>null</c> text and metadata.
        /// </para>
        /// <para>
        /// Exceptions raised while parsing are not propagated: they are wrapped in an
        /// <see cref="System.IO.InvalidDataException" />, handed to <paramref name="monitor" /> (when
        /// one is supplied) and the method returns <c>false</c>.
        /// </para>
        /// </remarks>
        /// <param name="sentence">
        /// When this method returns <c>true</c>, the parsed sentence; otherwise <c>null</c>.
        /// </param>
        /// <param name="sentenceString">The raw text of the sentence block.</param>
        /// <param name="para">The paragraph number, written into the metadata as <c> p=N</c>.</param>
        /// <param name="isTitle">if set to <c>true</c>, <c> title</c> is appended to the metadata.</param>
        /// <param name="isBox">if set to <c>true</c>, <c> box</c> is appended to the metadata.</param>
        /// <param name="safeParse">
        /// Forwarded to <c>TryParseElement</c>. When <c>true</c>, a line that cannot be parsed as a
        /// tree element makes the whole sentence fail; when <c>false</c> such a line is skipped.
        /// </param>
        /// <param name="monitor">The evaluation monitor. This value can be a <c>null</c> value.</param>
        /// <returns><c>true</c> if the <paramref name="sentenceString"/> parameter was converted successfully, <c>false</c> otherwise.</returns>
        public static bool TryParse(
            out AdSentence sentence,
            string sentenceString,
            int para,
            bool isTitle,
            bool isBox,
            bool safeParse,
            Monitor monitor)
        {
            string text = null;
            string meta = null;
            var    sent = new AdSentence();

            try {
                using (var reader = new StringReader(sentenceString)) {
                    // first line is <s ...>
                    var line = reader.ReadLine();

                    if (line == null)
                    {
                        sentence = null;
                        return false;
                    }

                    var useSameTextAndMeta = false; // to handle alternative parse suggestions (&&)

                    // All markers below are fixed ASCII tokens, so they are matched ordinally;
                    // this also keeps the fixed-offset Substring calls aligned with the match.
                    while (!line.StartsWith("SOURCE", StringComparison.Ordinal))
                    {
                        if (line.Equals("&&"))
                        {
                            useSameTextAndMeta = true;
                            break;
                        }
                        line = reader.ReadLine();

                        if (line == null)
                        {
                            sentence = null;
                            return false;
                        }
                    }

                    if (!useSameTextAndMeta)
                    {
                        // Drop the first 7 characters of the SOURCE line. A shorter line makes
                        // Substring throw, which the catch below turns into a failed parse.
                        var metaFromSource = line.Substring(7);

                        line = reader.ReadLine();

                        if (line == null)
                        {
                            sentence = null;
                            return false;
                        }

                        // The first space separates the sentence id from the sentence text.
                        var start = line.IndexOf(' ');

                        text = FixPunctuation(line.Substring(start + 1).Trim());

                        if (start > 0)
                        {
                            meta = line.Substring(0, start) + " p=" + para;
                            if (isTitle)
                            {
                                meta += " title";
                            }

                            if (isBox)
                            {
                                meta += " box";
                            }

                            meta += metaFromSource;
                        }
                        else
                        {
                            // rare case where there is no space between the id and the sentence.

                            if (monitor != null)
                            {
                                monitor.OnWarning("A sentence was skipped due a possible integrity loss.");
                            }

                            // OpenNLP reuses the previous meta, but it is better to ignore the
                            // sentence since the previous meta is not related to the current one.

                            sentence = null;
                            return false;
                        }
                    }
                    sent.Text     = text;
                    sent.Metadata = meta;

                    // skip lines starting with ###
                    line = reader.ReadLine();
                    while (line != null && line.StartsWith("###", StringComparison.Ordinal))
                    {
                        line = reader.ReadLine();
                    }

                    // Used as a stack (top = last item); a list is needed because the bottom
                    // entry and intermediate entries are also accessed by index.
                    var nodeStack = new List<AdNode>();

                    sent.Root = new AdNode {
                        SyntacticTag = "ROOT",
                        Level        = 0
                    };

                    nodeStack.Add(sent.Root);

                    while (!string.IsNullOrEmpty(line) &&
                           !line.StartsWith("</s>", StringComparison.Ordinal) &&
                           !line.Equals("&&"))
                    {
                        AdTreeElement element;

                        if (TryParseElement(out element, line, safeParse, monitor))
                        {
                            // The idea here is to keep a stack of nodes that are candidates for
                            // parenting the following elements (nodes and leafs).

                            // 1) When we get a new element, we check its level and remove from
                            // the top of the stack nodes that are brothers or nephews.
                            while (nodeStack.Count != 0 && element.Level > 0 &&
                                   element.Level <= nodeStack[nodeStack.Count - 1].Level)
                            {
                                nodeStack.RemoveAt(nodeStack.Count - 1); // pop
                            }

                            if (element.IsLeaf)
                            {
                                // 2) Leafs are attached to the closest candidate with a lower
                                // level; level 0 leafs go straight to the bottom of the stack.

                                if (element.Level == 0)
                                {
                                    nodeStack[0].Elements.Add(element);
                                }
                                else
                                {
                                    var    peek   = nodeStack[nodeStack.Count - 1];
                                    var    index  = nodeStack.Count - 1;
                                    AdNode parent = null;
                                    while (parent == null)
                                    {
                                        if (peek.Level < element.Level)
                                        {
                                            parent = peek;
                                            break;
                                        }
                                        index--;
                                        if (index > -1)
                                        {
                                            peek = nodeStack[index];
                                        }
                                        else
                                        {
                                            // no candidate with a lower level: fall back to the
                                            // bottom of the stack
                                            parent = nodeStack[0];
                                        }
                                    }
                                    parent.AddElement(element);
                                }
                            }
                            else
                            {
                                // 3) Check if the element that is at the top of the stack is this
                                // node parent, if yes add it as a son

                                if (nodeStack.Count != 0 && nodeStack[nodeStack.Count - 1].Level < element.Level)
                                {
                                    nodeStack[nodeStack.Count - 1].AddElement(element);
                                }
                                else
                                {
                                    // Caught below like any other failure.
                                    throw new InvalidOperationException("Should not happen!");
                                }

                                nodeStack.Add((AdNode)element);
                            }
                        }
                        else if (safeParse)
                        {
                            // invalid element, so we skip this sentence...
                            sentence = null;
                            return false;
                        }
                        line = reader.ReadLine();
                    }
                }
            } catch (Exception ex) {
                // TryParse contract: report the failure instead of propagating it.
                if (monitor != null)
                {
                    monitor.OnException(new InvalidDataException("Something went wrong during the AdSentence parse.", ex));
                }

                sentence = null;
                return false;
            }

            sentence = sent;
            return true;
        }
Пример #4
0
        /// <summary>
        /// Tries to convert the string representation of a single sentence block into an
        /// <see cref="AdSentence" />. The return value indicates whether the conversion succeeded.
        /// </summary>
        /// <remarks>
        /// <para>
        /// Lines are skipped until one starting with <c>SOURCE</c> is found. The line after it is
        /// split at its first space: the leading token opens the metadata and the remainder (trimmed
        /// and passed through <c>FixPunctuation</c>) becomes the sentence text. The following lines,
        /// up to an empty line, a line starting with <c>&lt;/s&gt;</c> or a <c>&amp;&amp;</c> line,
        /// are parsed as tree elements and attached below a <c>ROOT</c> node according to their level.
        /// </para>
        /// <para>
        /// If a <c>&amp;&amp;</c> line is met before any <c>SOURCE</c> line, the header is not parsed
        /// and the resulting sentence has <c>null</c> text and metadata.
        /// </para>
        /// <para>
        /// Exceptions raised while parsing are not propagated: they are wrapped in an
        /// <see cref="System.IO.InvalidDataException" />, handed to <paramref name="monitor" /> (when
        /// one is supplied) and the method returns <c>false</c>.
        /// </para>
        /// </remarks>
        /// <param name="sentence">
        /// When this method returns <c>true</c>, the parsed sentence; otherwise <c>null</c>.
        /// </param>
        /// <param name="sentenceString">The raw text of the sentence block.</param>
        /// <param name="para">The paragraph number, written into the metadata as <c> p=N</c>.</param>
        /// <param name="isTitle">if set to <c>true</c>, <c> title</c> is appended to the metadata.</param>
        /// <param name="isBox">if set to <c>true</c>, <c> box</c> is appended to the metadata.</param>
        /// <param name="safeParse">
        /// Forwarded to <c>TryParseElement</c>. When <c>true</c>, a line that cannot be parsed as a
        /// tree element makes the whole sentence fail; when <c>false</c> such a line is skipped.
        /// </param>
        /// <param name="monitor">The evaluation monitor. This value can be a <c>null</c> value.</param>
        /// <returns><c>true</c> if the <paramref name="sentenceString"/> parameter was converted successfully, <c>false</c> otherwise.</returns>
        public static bool TryParse(
            out AdSentence sentence,
            string sentenceString,
            int para,
            bool isTitle,
            bool isBox,
            bool safeParse,
            Monitor monitor) {
            string text = null;
            string meta = null;
            var sent = new AdSentence();

            try {
                using (var reader = new StringReader(sentenceString)) {
                    // first line is <s ...>
                    var line = reader.ReadLine();

                    if (line == null) {
                        sentence = null;
                        return false;
                    }

                    var useSameTextAndMeta = false; // to handle alternative parse suggestions (&&)

                    // All markers below are fixed ASCII tokens, so they are matched ordinally;
                    // this also keeps the fixed-offset Substring calls aligned with the match.
                    while (!line.StartsWith("SOURCE", StringComparison.Ordinal)) {
                        if (line.Equals("&&")) {
                            useSameTextAndMeta = true;
                            break;
                        }
                        line = reader.ReadLine();

                        if (line == null) {
                            sentence = null;
                            return false;
                        }
                    }

                    if (!useSameTextAndMeta) {
                        // Drop the first 7 characters of the SOURCE line. A shorter line makes
                        // Substring throw, which the catch below turns into a failed parse.
                        var metaFromSource = line.Substring(7);

                        line = reader.ReadLine();

                        if (line == null) {
                            sentence = null;
                            return false;
                        }

                        // The first space separates the sentence id from the sentence text.
                        var start = line.IndexOf(' ');

                        text = FixPunctuation(line.Substring(start + 1).Trim());

                        if (start > 0) {
                            meta = line.Substring(0, start) + " p=" + para;
                            if (isTitle)
                                meta += " title";

                            if (isBox)
                                meta += " box";

                            meta += metaFromSource;
                        } else {
                            // rare case where there is no space between the id and the sentence.

                            if (monitor != null)
                                monitor.OnWarning("A sentence was skipped due a possible integrity loss.");

                            // OpenNLP reuses the previous meta, but it is better to ignore the
                            // sentence since the previous meta is not related to the current one.

                            sentence = null;
                            return false;
                        }
                    }
                    sent.Text = text;
                    sent.Metadata = meta;

                    // skip lines starting with ###
                    line = reader.ReadLine();
                    while (line != null && line.StartsWith("###", StringComparison.Ordinal)) {
                        line = reader.ReadLine();
                    }

                    // Used as a stack (top = last item); a list is needed because the bottom
                    // entry and intermediate entries are also accessed by index.
                    var nodeStack = new List<AdNode>();

                    sent.Root = new AdNode {
                        SyntacticTag = "ROOT",
                        Level = 0
                    };

                    nodeStack.Add(sent.Root);

                    while (!string.IsNullOrEmpty(line) &&
                           !line.StartsWith("</s>", StringComparison.Ordinal) &&
                           !line.Equals("&&")) {
                        AdTreeElement element;

                        if (TryParseElement(out element, line, safeParse, monitor)) {
                            // The idea here is to keep a stack of nodes that are candidates for
                            // parenting the following elements (nodes and leafs).

                            // 1) When we get a new element, we check its level and remove from
                            // the top of the stack nodes that are brothers or nephews.
                            while (nodeStack.Count != 0 && element.Level > 0 &&
                                   element.Level <= nodeStack[nodeStack.Count - 1].Level) {
                                nodeStack.RemoveAt(nodeStack.Count - 1); // pop
                            }

                            if (element.IsLeaf) {
                                // 2) Leafs are attached to the closest candidate with a lower
                                // level; level 0 leafs go straight to the bottom of the stack.

                                if (element.Level == 0) {
                                    nodeStack[0].Elements.Add(element);
                                } else {
                                    var peek = nodeStack[nodeStack.Count - 1];
                                    var index = nodeStack.Count - 1;
                                    AdNode parent = null;
                                    while (parent == null) {
                                        if (peek.Level < element.Level) {
                                            parent = peek;
                                            break;
                                        }
                                        index--;
                                        if (index > -1) {
                                            peek = nodeStack[index];
                                        } else {
                                            // no candidate with a lower level: fall back to the
                                            // bottom of the stack
                                            parent = nodeStack[0];
                                        }
                                    }
                                    parent.AddElement(element);
                                }
                            } else {
                                // 3) Check if the element that is at the top of the stack is this
                                // node parent, if yes add it as a son

                                if (nodeStack.Count != 0 && nodeStack[nodeStack.Count - 1].Level < element.Level) {
                                    nodeStack[nodeStack.Count - 1].AddElement(element);
                                } else {
                                    // Caught below like any other failure.
                                    throw new InvalidOperationException("Should not happen!");
                                }

                                nodeStack.Add((AdNode) element);
                            }
                        } else if (safeParse) {
                            // invalid element, so we skip this sentence...
                            sentence = null;
                            return false;
                        }
                        line = reader.ReadLine();
                    }
                }
            } catch (Exception ex) {
                // TryParse contract: report the failure instead of propagating it.
                if (monitor != null)
                    monitor.OnException(new InvalidDataException("Something went wrong during the AdSentence parse.", ex));

                sentence = null;
                return false;
            }

            sentence = sent;
            return true;
        }