コード例 #1
0
        /// <summary>Extracts text from the HTML code given as a string.</summary>
        /// <remarks>
        /// The HTML is parsed with an <c>NBoilerpipeHtmlParser</c> backed by a fresh
        /// <c>NBoilerpipeContentHandler</c>, and the document produced by the parser is then
        /// handed to <c>GetText</c> to obtain the result.
        /// </remarks>
        /// <param name="html">The HTML code as a string.</param>
        /// <returns>The extracted text.</returns>
        /// <exception cref="NBoilerpipePortable.BoilerpipeProcessingException">
        /// Thrown when parsing or extraction fails. A <c>BoilerpipeProcessingException</c> raised
        /// while parsing or extracting is propagated unchanged; any other exception is wrapped,
        /// with the message set to the original exception's <c>ToString()</c> output.
        /// </exception>
        public virtual string GetText(string html)
        {
            try
            {
                NBoilerpipeHtmlParser parser = new NBoilerpipeHtmlParser(new NBoilerpipeContentHandler());
                parser.Parse(html);
                return GetText(parser.ToTextDocument());
            }
            catch (BoilerpipeProcessingException)
            {
                // Already the exception type callers expect: rethrow as-is so the original
                // message and stack trace survive instead of being flattened into the
                // message of a second wrapper.
                throw;
            }
            catch (Exception e)
            {
                // Any other failure is surfaced as a processing exception; the full
                // ToString() output is kept as the message so the cause stays visible.
                throw new BoilerpipeProcessingException(e.ToString());
            }
        }
コード例 #2
0
        /// <summary>
        /// Parses the given HTML, runs <c>Process</c> on the resulting document, and returns
        /// the content blocks as an ordered list of text entries and image entries.
        /// </summary>
        /// <remarks>
        /// Only text blocks for which <c>IsContent()</c> is true contribute to the result.
        /// A text entry has the text in <c>Item1</c> and an empty <c>Item2</c>; an image entry
        /// has an empty <c>Item1</c> and the value returned by <c>CleanImageUrl</c> in
        /// <c>Item2</c>. Each entry of a block's <c>NearbyImages</c> is used as
        /// (character offset into the block text, image reference): the block text is split
        /// at those offsets and the images are interleaved between the pieces.
        /// NOTE(review): this assumes <c>NearbyImages</c> is ordered by ascending offset;
        /// offsets that are out of order or beyond the end of the text are clamped rather
        /// than causing an exception - confirm this matches the producer's contract.
        /// </remarks>
        /// <param name="html">The HTML code as a string.</param>
        /// <param name="uri">
        /// Passed to <c>CleanImageUrl</c> together with each image reference
        /// (presumably the page address used to resolve relative image URLs - TODO confirm).
        /// </param>
        /// <param name="title">Receives the value of <c>GetTitle()</c> of the processed document.</param>
        /// <returns>The text and image entries in document order.</returns>
        public IEnumerable<Tuple<string, string>> GetTextAndImageBlocks(string html, Uri uri, out string title)
        {
            var contentHandler = new NBoilerpipeContentHandler();
            NBoilerpipeHtmlParser parser = new NBoilerpipeHtmlParser(contentHandler);
            parser.Parse(html);
            var doc = parser.ToTextDocument();
            this.Process(doc);
            title = doc.GetTitle();

            List<Tuple<string, string>> result = new List<Tuple<string, string>>();
            foreach (var textblock in doc.GetTextBlocks())
            {
                if (!textblock.IsContent())
                {
                    continue;
                }

                // textOffset is the number of characters of the block text already emitted;
                // remainingText is the part of the block text that has not been emitted yet.
                int textOffset = 0;
                var remainingText = textblock.GetText();
                foreach (var imageTpl in textblock.NearbyImages)
                {
                    // Amount of text that lies between the previous split point and this image.
                    int length = imageTpl.Item1 - textOffset;
                    if (length > 0 && !string.IsNullOrEmpty(remainingText))
                    {
                        // Clamp so an offset past the end of the text cannot make Substring throw.
                        length = Math.Min(length, remainingText.Length);
                        result.Add(Tuple.Create(remainingText.Substring(0, length), ""));
                        remainingText = remainingText.Substring(length);
                        textOffset += length;
                    }

                    // When length <= 0 (image at the current split point, or an out-of-order
                    // offset) no text entry is emitted, so no empty ("", "") entry is produced.
                    result.Add(Tuple.Create("", CleanImageUrl(uri, imageTpl.Item2)));
                }

                if (!string.IsNullOrWhiteSpace(remainingText))
                {
                    result.Add(Tuple.Create(remainingText, ""));
                }
            }

            return result;
        }