/// <summary>
/// Runs the search for <paramref name="query"/> page by page and collects the
/// detailed results of every page, in page order.
/// </summary>
/// <param name="query">Search phrase handed to each <c>YandexQuery</c>.</param>
/// <param name="httpModule">HTTP module used for the SERP request and the per-item detail requests.</param>
/// <param name="antigateModule">Antigate module forwarded to <c>ExecuteRequest</c>.</param>
/// <returns>The pages produced by <c>LoadSerpItemDetails</c> for every requested SERP page.</returns>
/// <exception cref="NullReferenceException">
/// Thrown when <paramref name="httpModule"/> or <paramref name="antigateModule"/> is null.
/// </exception>
private List<SerpWebPage> Execute(string query, IRuntimeModule<HttpProviderRuntimeConfig, HttpProviderData> httpModule, IRuntimeModule<AntigateTaskConfig, AntigateTaskResult> antigateModule)
{
    if (httpModule == null)
    {
        throw new NullReferenceException("Http module");
    }

    if (antigateModule == null)
    {
        throw new NullReferenceException("Antigate module");
    }

    var collectedPages = new List<SerpWebPage>();

    // A single page is requested when no explicit limit is configured.
    int maxPages = Configuration.PageLimit ?? 1;

    var serpParser = new YandexHtmlSerpParser();
    var detailParser = new HtmlTagPageParser();

    for (int pageIndex = 0; pageIndex < maxPages; pageIndex++)
    {
        var pageQuery = new YandexQuery(query, pageIndex, Configuration.Region);
        var serpItems = ExecuteRequest(pageQuery, serpParser, httpModule, antigateModule);
        var detailedPages = LoadSerpItemDetails(detailParser, httpModule, serpItems);
        collectedPages.AddRange(detailedPages);
    }

    return collectedPages;
}
/// <summary>
/// Converts each SERP item into a <c>SerpWebPage</c>, asking
/// <c>TryExtractDetaisl</c> for the item's redirect target and title.
/// </summary>
/// <param name="tagParser">HTML tag parser passed through to the detail extraction.</param>
/// <param name="httpModule">HTTP module passed through to the detail extraction.</param>
/// <param name="serpList">SERP items to convert.</param>
/// <returns>One <c>SerpWebPage</c> per item of <paramref name="serpList"/>, in the same order.</returns>
private List<SerpWebPage> LoadSerpItemDetails(HtmlTagPageParser tagParser, IRuntimeModule<HttpProviderRuntimeConfig, HttpProviderData> httpModule, List<YandexHtmlSerpItem> serpList)
{
    var pages = new List<SerpWebPage>();

    foreach (var item in serpList)
    {
        string resolvedUrl;
        string pageTitle;
        TryExtractDetaisl(tagParser, httpModule, item.Href, out resolvedUrl, out pageTitle);

        // Keep the SERP link itself when no redirect target was found.
        if (string.IsNullOrEmpty(resolvedUrl))
        {
            resolvedUrl = item.Href;
        }

        pages.Add(new SerpWebPage
        {
            Position = item.Postion,
            Url = resolvedUrl,
            Title = pageTitle
        });
    }

    return pages;
}
/// <summary>
/// Best-effort lookup of a page's title and, when the page declares a
/// <c>&lt;meta http-equiv='refresh'&gt;</c> redirect, of the redirect target.
/// If a redirect target is found, that target is downloaded as well and its
/// title (when present) replaces the title of the first page.
/// </summary>
/// <param name="tagParser">Parser used to evaluate the XPath selectors against the downloaded HTML.</param>
/// <param name="httpModule">HTTP module used to download <paramref name="url"/> and the redirect target.</param>
/// <param name="url">Address of the page to inspect.</param>
/// <param name="redirectUrl">
/// Receives the meta-refresh target, or <see cref="string.Empty"/> when none was found
/// or a request/parse step did not succeed.
/// </param>
/// <param name="title">
/// Receives the page title, or <see cref="string.Empty"/> when none was found
/// or a request/parse step did not succeed.
/// </param>
private void TryExtractDetaisl(HtmlTagPageParser tagParser, IRuntimeModule<HttpProviderRuntimeConfig, HttpProviderData> httpModule, string url, out string redirectUrl, out string title)
{
    redirectUrl = string.Empty;
    title = string.Empty;

    var httpResult = RuntimeTask.Run(Context, httpModule, new HttpProviderRuntimeConfig() { Query = url });
    if (!httpResult.IsSuccessfully)
    {
        return;
    }

    var extractor = new HtmlTextAttributeParser()
    {
        Attribute = "content",
        PostHandle = (value) =>
        {
            if (string.IsNullOrEmpty(value))
            {
                return value;
            }

            // The refresh content looks like "0;URL='http://...'"; the "url" keyword
            // appears in any letter case in real pages, so match it case-insensitively
            // and culture-independently.
            const string urlTag = "URL=";
            int startIndex = value.IndexOf(urlTag, System.StringComparison.OrdinalIgnoreCase);
            if (startIndex != -1)
            {
                // The target may be wrapped in single or double quotes and padded with spaces.
                value = value.Substring(startIndex + urlTag.Length).Trim().Trim('\'', '"');
            }

            return value;
        }
    };

    var tagConfig = new HtmlTagPageConfig();
    tagConfig.Tags.Add("redirecturl", new SinglHtmlNodeSelector()
    {
        Tag = new TagProperties() { Path = @"//meta[@http-equiv='refresh']", Extractor = extractor }
    });
    tagConfig.Tags.Add("title", new SinglHtmlNodeSelector()
    {
        Tag = new TagProperties() { Path = @"//title", Extractor = new HtmlTagInnerText() }
    });

    using (var stream = httpResult.Data.GetContent())
    {
        tagConfig.Stream = stream;
        var parseResult = RuntimeTask.Run(Context, tagParser, tagConfig);
        if (parseResult.IsSuccessfully)
        {
            if (parseResult.Data.Values["redirecturl"] != null)
            {
                redirectUrl = (string)parseResult.Data.Values["redirecturl"];
            }

            if (parseResult.Data.Values["title"] != null)
            {
                title = (string)parseResult.Data.Values["title"];
            }
        }
    }

    if (string.IsNullOrEmpty(redirectUrl))
    {
        return;
    }

    // Follow the redirect once; the title of the target page takes precedence.
    httpResult = RuntimeTask.Run(Context, httpModule, new HttpProviderRuntimeConfig() { Query = redirectUrl });
    if (!httpResult.IsSuccessfully)
    {
        return;
    }

    using (var stream = httpResult.Data.GetContent())
    {
        tagConfig.Stream = stream;

        // Run the parser through RuntimeTask like every other step of this method so
        // the outcome is checked via IsSuccessfully instead of calling the parser directly.
        // NOTE(review): presumes RuntimeTask.Run reports parser failures through
        // IsSuccessfully rather than throwing — confirm against RuntimeTask.
        var redirectParse = RuntimeTask.Run(Context, tagParser, tagConfig);
        if (redirectParse.IsSuccessfully && redirectParse.Data.Values["title"] != null)
        {
            title = (string)redirectParse.Data.Values["title"];
        }
    }
}