예제 #1
0
        /// <summary>
        /// Downloads the page at <paramref name="pageURL"/>, selects every element matching
        /// <paramref name="cssSelector"/> and gathers their text into a <see cref="PageItem"/>.
        /// </summary>
        /// <param name="pageURL">Address of the page to download; also stored as the item's Url.</param>
        /// <param name="cssSelector">CSS selector identifying the elements whose text content is wanted.</param>
        /// <returns>
        /// A <see cref="PageItem"/> with Content and Url set, or <c>null</c> when the selector
        /// matched nothing or when any exception occurred (the exception is logged, not rethrown).
        /// </returns>
        public static PageItem ProcessWebPage(String pageURL, String cssSelector)
        {
            PageItem pageItem = null;

            try
            {
                using (GZipWebClient client = new GZipWebClient())
                {
                    // Send a randomly chosen user agent taken from the application settings.
                    client.Headers.Add("user-agent", Analyzer.Common.Configuration.ConfigurationManager.AppSettings.UserAgentOptions.GetRandom());
                    string webLocationContent = client.DownloadString(pageURL);

                    var parser   = new HtmlParser();
                    var document = parser.Parse(webLocationContent);

                    // Materialize the text of every matching element once, so the deferred
                    // query is not re-evaluated for the emptiness check and again for the loop.
                    var elementTexts = document.QuerySelectorAll(cssSelector)
                                               .Select(m => m.TextContent)
                                               .ToList();

                    if (elementTexts.Count > 0)
                    {
                        pageItem = new PageItem();

                        // One line per matched element.
                        StringBuilder sb = new StringBuilder();
                        foreach (var elementText in elementTexts)
                        {
                            sb.AppendLine(elementText);
                        }

                        pageItem.Content = Analyzer.Common.HtmlRemoval.StripTagsCharArray(sb.ToString());
                        pageItem.Url     = pageURL;
                    }
                }
            }
            catch (Exception ex)
            {
                // Best-effort contract: any failure (download, parsing, selector) is logged and reported as null.
                Analyzer.Common.Logger.ExceptionLoggingService.Instance.WriteError("Error in processing given wen URL: " + pageURL, ex);
                pageItem = null;
            }

            return pageItem;
        }
예제 #2
0
        /// <summary>
        /// Downloads the page at <paramref name="pageURL"/> and extracts the title
        /// (<c>h1.entry-title</c>) and body (<c>div.entry-content</c>) of its single
        /// <c>article</c> element into a <see cref="PageItem"/>.
        /// </summary>
        /// <param name="pageURL">Address of the article page to download; also stored as the item's Url.</param>
        /// <returns>
        /// A <see cref="PageItem"/> with Title, Content and Url set, or <c>null</c> when the page
        /// could not be downloaded or does not have the expected structure (the failure is logged,
        /// not rethrown).
        /// </returns>
        public static PageItem ProcessWordpressArticle(String pageURL)
        {
            PageItem pageItem = null;

            try
            {
                using (GZipWebClient client = new GZipWebClient())
                {
                    // Send a randomly chosen user agent taken from the application settings.
                    client.Headers.Add("user-agent", Analyzer.Common.Configuration.ConfigurationManager.AppSettings.UserAgentOptions.GetRandom());
                    string webLocationContent = client.DownloadString(pageURL);

                    var parser   = new HtmlParser();
                    var document = parser.Parse(webLocationContent);

                    // SingleOrDefault throws when the page holds more than one <article>;
                    // that case is handled by the catch below, as before.
                    var article = document.All.SingleOrDefault(o => o.LocalName == "article");
                    if (article == null)
                    {
                        // Raise a descriptive error instead of letting a NullReferenceException reach the log.
                        throw new InvalidOperationException("The page contains no <article> element.");
                    }

                    var title = article.QuerySelectorAll("h1.entry-title").FirstOrDefault();
                    if (title == null)
                    {
                        throw new InvalidOperationException("The article contains no 'h1.entry-title' element.");
                    }

                    var content = article.QuerySelectorAll("div.entry-content").FirstOrDefault();
                    if (content == null)
                    {
                        throw new InvalidOperationException("The article contains no 'div.entry-content' element.");
                    }

                    pageItem = new PageItem();

                    pageItem.Title   = title.TextContent;
                    pageItem.Content = Analyzer.Common.HtmlRemoval.StripTagsCharArray(content.TextContent);
                    pageItem.Url     = pageURL;
                }
            }
            catch (Exception ex)
            {
                // Best-effort contract: any failure (download, parsing, missing elements) is logged and reported as null.
                Analyzer.Common.Logger.ExceptionLoggingService.Instance.WriteError("Error in processing given wen URL: " + pageURL, ex);
                pageItem = null;
            }

            return pageItem;
        }