/// <summary>
/// Reads the body of a search response as a string.
/// </summary>
/// <param name="response">The HTTP response to read.</param>
/// <param name="captchaWindow">Not used by this implementation.</param>
/// <param name="pageNumber">Not used by this implementation.</param>
/// <param name="cancellationToken">Not used by this implementation.</param>
/// <returns>The response content as text.</returns>
/// <exception cref="InvalidOperationException">
/// The response does not carry a success status code; the message contains the status code.
/// </exception>
private async Task<string> GetContent(HttpResponseMessage response, ICaptchaWindow captchaWindow, int pageNumber, CancellationTokenSource cancellationToken)
{
    // Happy path first: a successful response is simply read to the end.
    if (response.IsSuccessStatusCode)
    {
        return await response.Content.ReadAsStringAsync();
    }

    // Anything else aborts the search and reports the status code.
    string statusText = response.StatusCode.ToString();
    throw new InvalidOperationException(
        string.Format("Cannot continue google search because it retrieved error '{0}'", statusText));
}
/// <summary>
/// Returns the textual content of <paramref name="response"/>, rejecting unsuccessful responses.
/// </summary>
/// <param name="response">The HTTP response whose content is read.</param>
/// <param name="captchaWindow">Accepted but not used in this method body.</param>
/// <param name="pageNumber">Accepted but not used in this method body.</param>
/// <param name="cancellationToken">Accepted but not used in this method body.</param>
/// <returns>The response content read as a string.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when <c>response.IsSuccessStatusCode</c> is false.
/// </exception>
private async Task<string> GetContent(HttpResponseMessage response, ICaptchaWindow captchaWindow, int pageNumber, CancellationTokenSource cancellationToken)
{
    bool failed = !response.IsSuccessStatusCode;
    if (failed)
    {
        // The status code is embedded in the message so the caller can see why the search stopped.
        string message = string.Format(
            "Cannot continue google search because it retrieved error '{0}'",
            response.StatusCode.ToString());
        throw new InvalidOperationException(message);
    }

    string content = await response.Content.ReadAsStringAsync();
    return content;
}
/// <summary>
/// Builds the search query template from the given keywords and stores the search settings.
/// </summary>
/// <param name="allowedCount">Value stored in <c>allowedCount</c>.</param>
/// <param name="keywords">
/// Space-separated keywords; empty pieces are dropped and each keyword is URI-escaped.
/// </param>
/// <param name="captchaWindow">Value stored in <c>captchaWindow</c>.</param>
public GoogleSearchEngine(int allowedCount, string keywords, ICaptchaWindow captchaWindow)
{
    // Split on single spaces, ignoring the empty entries produced by repeated spaces.
    char[] separators = { ' ' };
    string[] words = keywords.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    // Escape every keyword individually so the '+' separators stay literal.
    string[] escapedWords = new string[words.Length];
    for (int i = 0; i < words.Length; i++)
    {
        escapedWords[i] = Uri.EscapeDataString(words[i]);
    }

    // The trailing "{0}" is left as a format placeholder for the "start" value
    // (presumably filled in per page by the code that issues the request).
    queryString = "http://www.google.com.ua/search?q=" + string.Join("+", escapedWords) + "&start={0}";

    linkNumber = 0;
    this.allowedCount = allowedCount;
    this.captchaWindow = captchaWindow;
    searchOnPage = null;
}
/// <summary>
/// Downloads <paramref name="googlePage"/>, stores its body in <c>pageContent</c> and adds the
/// links extracted from its anchor tags to <c>urls</c>.
/// </summary>
/// <param name="googlePage">Address of the page to download.</param>
/// <param name="captchaWindow">Forwarded to <c>GetContent</c>.</param>
/// <param name="pageNumber">Forwarded to <c>GetContent</c>.</param>
/// <param name="cancellationToken">
/// Source whose token cancels the HTTP request; also forwarded to <c>GetContent</c>.
/// </param>
/// <returns>A task that completes when the page has been downloaded and parsed.</returns>
/// <remarks>
/// Only hrefs that start with <c>urlPrefix</c> are considered. The prefix is removed, the text is
/// cut at the first <c>urlSuffix</c> (when present), and the remainder must be a well-formed
/// absolute URI for which <c>InException</c> returns false.
/// </remarks>
public async Task Initialize(Uri googlePage, ICaptchaWindow captchaWindow, int pageNumber, CancellationTokenSource cancellationToken)
{
    // NOTE(review): a new HttpClient is created on every call; consider sharing one instance.
    using (HttpClient client = new HttpClient())
    using (HttpResponseMessage response = await client.GetAsync(googlePage, cancellationToken.Token))
    {
        pageContent = await GetContent(response, captchaWindow, pageNumber, cancellationToken);
    }

    Regex regex = new Regex("<a[^>]*?href\\s*=\\s*(?<url>[\"']?([^\"'>]+?)['\"])?[^>]*?>");
    foreach (Match match in regex.Matches(pageContent))
    {
        // The "url" group is optional in the pattern: an anchor whose href value has no closing
        // quote still matches, but with an empty group. Skip it instead of failing on Remove().
        Group urlGroup = match.Groups["url"];
        if (!urlGroup.Success)
        {
            continue;
        }

        // The captured text ends with a quote and may or may not start with one. The captured
        // value itself cannot contain quotes, so trimming quote characters strips exactly the
        // delimiters and never a real character of the link.
        string url = urlGroup.Value.Trim('"', '\'');

        // URLs are not linguistic text: compare ordinally rather than with the current culture.
        if (!url.StartsWith(urlPrefix, StringComparison.Ordinal))
        {
            continue;
        }

        url = url.Substring(urlPrefix.Length);

        // Drop everything from the suffix marker onwards, when the marker is present.
        int index = url.IndexOf(urlSuffix, StringComparison.Ordinal);
        if (index != -1)
        {
            url = url.Substring(0, index);
        }

        if (!Uri.IsWellFormedUriString(url, UriKind.Absolute))
        {
            continue;
        }

        Uri uri = new Uri(url);
        if (!InException(uri))
        {
            urls.Add(uri);
        }
    }
}
/// <summary>
/// Downloads <paramref name="googlePage"/>, stores its body in <c>pageContent</c> and adds the
/// links extracted from its anchor tags to <c>urls</c>.
/// </summary>
/// <param name="googlePage">Address of the page to download.</param>
/// <param name="captchaWindow">Forwarded to <c>GetContent</c>.</param>
/// <param name="pageNumber">Forwarded to <c>GetContent</c>.</param>
/// <param name="cancellationToken">
/// Source whose token cancels the HTTP request; also forwarded to <c>GetContent</c>.
/// </param>
/// <returns>A task that completes when the page has been downloaded and parsed.</returns>
/// <remarks>
/// Only hrefs that start with <c>urlPrefix</c> are considered. The prefix is removed, the text is
/// cut at the first <c>urlSuffix</c> (when present), and the remainder must be a well-formed
/// absolute URI for which <c>InException</c> returns false.
/// </remarks>
public async Task Initialize(Uri googlePage, ICaptchaWindow captchaWindow, int pageNumber, CancellationTokenSource cancellationToken)
{
    // NOTE(review): a new HttpClient is created on every call; consider sharing one instance.
    using (HttpClient client = new HttpClient())
    using (HttpResponseMessage response = await client.GetAsync(googlePage, cancellationToken.Token))
    {
        pageContent = await GetContent(response, captchaWindow, pageNumber, cancellationToken);
    }

    Regex regex = new Regex("<a[^>]*?href\\s*=\\s*(?<url>[\"']?([^\"'>]+?)['\"])?[^>]*?>");
    foreach (Match match in regex.Matches(pageContent))
    {
        // The "url" group is optional in the pattern: an anchor whose href value has no closing
        // quote still matches, but with an empty group. Skip it instead of failing on Remove().
        Group urlGroup = match.Groups["url"];
        if (!urlGroup.Success)
        {
            continue;
        }

        // The captured text ends with a quote and may or may not start with one. The captured
        // value itself cannot contain quotes, so trimming quote characters strips exactly the
        // delimiters and never a real character of the link.
        string url = urlGroup.Value.Trim('"', '\'');

        // URLs are not linguistic text: compare ordinally rather than with the current culture.
        if (!url.StartsWith(urlPrefix, StringComparison.Ordinal))
        {
            continue;
        }

        url = url.Substring(urlPrefix.Length);

        // Drop everything from the suffix marker onwards, when the marker is present.
        int index = url.IndexOf(urlSuffix, StringComparison.Ordinal);
        if (index != -1)
        {
            url = url.Substring(0, index);
        }

        if (!Uri.IsWellFormedUriString(url, UriKind.Absolute))
        {
            continue;
        }

        Uri uri = new Uri(url);
        if (!InException(uri))
        {
            urls.Add(uri);
        }
    }
}