コード例 #1
0
        /// <summary>
        /// Requests search result pages for <paramref name="query"/> and loads the details of
        /// every result item found on them.
        /// </summary>
        /// <remarks>
        /// One request is made per page index, starting at 0. The number of pages is
        /// <c>Configuration.PageLimit</c>, or 1 when that setting has no value.
        /// </remarks>
        /// <param name="query">Search text passed to each <c>YandexQuery</c>.</param>
        /// <param name="httpModule">Module used for the HTTP requests.</param>
        /// <param name="antigateModule">Module handed to <c>ExecuteRequest</c> together with the HTTP module.</param>
        /// <returns>The pages collected from all requested result pages, in request order.</returns>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="httpModule"/> or <paramref name="antigateModule"/> is null.
        /// </exception>
        private List <SerpWebPage> Execute(string query, IRuntimeModule <HttpProviderRuntimeConfig, HttpProviderData> httpModule, IRuntimeModule <AntigateTaskConfig, AntigateTaskResult> antigateModule)
        {
            // NullReferenceException is reserved for the runtime; a missing argument is
            // reported with ArgumentNullException and the offending parameter name.
            if (httpModule == null)
            {
                throw new ArgumentNullException("httpModule");
            }

            if (antigateModule == null)
            {
                throw new ArgumentNullException("antigateModule");
            }

            List <SerpWebPage> result = new List <SerpWebPage>();
            int pageLimit             = Configuration.PageLimit.HasValue ? Configuration.PageLimit.Value : 1;

            // Both parsers are created once and reused for every page.
            YandexHtmlSerpParser parser = new YandexHtmlSerpParser();
            HtmlTagPageParser tagParser = new HtmlTagPageParser();

            for (int pageIndex = 0; pageIndex < pageLimit; pageIndex++)
            {
                var serpList = ExecuteRequest(new YandexQuery(query, pageIndex, Configuration.Region), parser, httpModule, antigateModule);
                result.AddRange(LoadSerpItemDetails(tagParser, httpModule, serpList));
            }

            return result;
        }
コード例 #2
0
        /// <summary>
        /// Builds one SerpWebPage per item of <paramref name="serpList"/>, filling in the
        /// URL and title reported by <c>TryExtractDetaisl</c> for the item's link.
        /// </summary>
        /// <param name="tagParser">Parser module forwarded to <c>TryExtractDetaisl</c>.</param>
        /// <param name="httpModule">HTTP module forwarded to <c>TryExtractDetaisl</c>.</param>
        /// <param name="serpList">Result items to convert.</param>
        /// <returns>The created pages, in the same order as <paramref name="serpList"/>.</returns>
        private List <SerpWebPage> LoadSerpItemDetails(HtmlTagPageParser tagParser, IRuntimeModule <HttpProviderRuntimeConfig, HttpProviderData> httpModule, List <YandexHtmlSerpItem> serpList)
        {
            var pages = new List <SerpWebPage>();

            foreach (var item in serpList)
            {
                string resolvedUrl;
                string pageTitle;

                TryExtractDetaisl(tagParser, httpModule, item.Href, out resolvedUrl, out pageTitle);

                // Fall back to the item's own link when no redirect target was reported.
                if (string.IsNullOrEmpty(resolvedUrl))
                {
                    resolvedUrl = item.Href;
                }

                pages.Add(new SerpWebPage()
                {
                    Position = item.Postion,
                    Url      = resolvedUrl,
                    Title    = pageTitle
                });
            }

            return pages;
        }
コード例 #3
0
        /// <summary>
        /// Downloads <paramref name="url"/> and extracts its title and, when the page declares
        /// a meta-refresh redirect, the redirect target. If a target is found, it is downloaded
        /// too and its title, when present, replaces the first one.
        /// </summary>
        /// <remarks>
        /// Both outputs are empty strings when the first request or the first parse is not
        /// successful. A refresh declaration without a URL part is treated as "no redirect".
        /// </remarks>
        /// <param name="tagParser">Parser module that evaluates the tag selectors.</param>
        /// <param name="httpModule">Module used for the HTTP requests.</param>
        /// <param name="url">Address of the page to inspect.</param>
        /// <param name="redirectUrl">Receives the meta-refresh target, or an empty string.</param>
        /// <param name="title">Receives the page title, or an empty string.</param>
        private void TryExtractDetaisl(HtmlTagPageParser tagParser, IRuntimeModule <HttpProviderRuntimeConfig, HttpProviderData> httpModule, string url, out string redirectUrl, out string title)
        {
            redirectUrl = string.Empty;
            title       = string.Empty;

            var httpResult = RuntimeTask.Run(Context, httpModule, new HttpProviderRuntimeConfig()
            {
                Query = url
            });

            if (!httpResult.IsSuccessfully)
            {
                return;
            }

            var extractor = new HtmlTextAttributeParser()
            {
                Attribute  = "content",
                PostHandle = (value) =>
                {
                    // The attribute looks like "0; URL='http://example.com/'". The marker is
                    // matched ignoring case because "url=" is at least as common as "URL=".
                    const string urlTag = "URL=";

                    if (string.IsNullOrEmpty(value))
                    {
                        return(string.Empty);
                    }

                    int startIndex = value.IndexOf(urlTag, System.StringComparison.OrdinalIgnoreCase);
                    if (startIndex == -1)
                    {
                        // A bare delay such as "30" only reloads the page; returning it
                        // would make the caller treat the number as a redirect address.
                        return(string.Empty);
                    }

                    // Drop surrounding whitespace and either kind of quote around the address.
                    return(value.Substring(startIndex + urlTag.Length).Trim().Trim('\'', '"').Trim());
                }
            };

            var tagConfig = new HtmlTagPageConfig();
            tagConfig.Tags.Add("redirecturl", new SinglHtmlNodeSelector()
            {
                Tag = new TagProperties()
                {
                    Path = @"//meta[@http-equiv='refresh']", Extractor = extractor
                }
            });
            tagConfig.Tags.Add("title", new SinglHtmlNodeSelector()
            {
                Tag = new TagProperties()
                {
                    Path = @"//title", Extractor = new HtmlTagInnerText()
                }
            });

            using (var stream = httpResult.Data.GetContent())
            {
                tagConfig.Stream = stream;

                var parseResult = RuntimeTask.Run(Context, tagParser, tagConfig);
                if (parseResult.IsSuccessfully)
                {
                    if (parseResult.Data.Values["redirecturl"] != null)
                    {
                        redirectUrl = (string)parseResult.Data.Values["redirecturl"];
                    }

                    if (parseResult.Data.Values["title"] != null)
                    {
                        title = (string)parseResult.Data.Values["title"];
                    }
                }
            }

            if (string.IsNullOrEmpty(redirectUrl))
            {
                return;
            }

            // Follow the redirect once; the same selector configuration is reused with the
            // new content stream.
            httpResult = RuntimeTask.Run(Context, httpModule, new HttpProviderRuntimeConfig()
            {
                Query = redirectUrl
            });

            if (!httpResult.IsSuccessfully)
            {
                return;
            }

            using (var stream = httpResult.Data.GetContent())
            {
                tagConfig.Stream = stream;

                // Run through RuntimeTask like the first parse so that success is checked
                // the same way before the values are read.
                var redirectParseResult = RuntimeTask.Run(Context, tagParser, tagConfig);
                if (redirectParseResult.IsSuccessfully && redirectParseResult.Data.Values["title"] != null)
                {
                    title = (string)redirectParseResult.Data.Values["title"];
                }
            }
        }