/// <summary>
/// Reads the body of a search response as text.
/// </summary>
/// <param name="response">HTTP response whose status is checked and whose content is read.</param>
/// <param name="captchaWindow">Not used by this implementation.</param>
/// <param name="pageNumber">Not used by this implementation.</param>
/// <param name="cancellationToken">Not used by this implementation.</param>
/// <returns>The response content as a string.</returns>
/// <exception cref="InvalidOperationException">The response does not carry a success status code.</exception>
private async Task<string> GetContent(HttpResponseMessage response, ICaptchaWindow captchaWindow, int pageNumber, CancellationTokenSource cancellationToken)
        {
            // Happy path first: a successful response is simply read to the end.
            if (response.IsSuccessStatusCode)
            {
                string body = await response.Content.ReadAsStringAsync();
                return body;
            }

            // Anything else aborts the search, reporting the status the server answered with.
            string status = response.StatusCode.ToString();
            throw new InvalidOperationException(string.Format("Cannot continue google search because it retrieved error '{0}'", status));
        }
Exemple #2
0
        /// <summary>
        /// Returns the textual content of a search response, rejecting unsuccessful responses.
        /// </summary>
        /// <param name="response">HTTP response to validate and read.</param>
        /// <param name="captchaWindow">Not used by this implementation.</param>
        /// <param name="pageNumber">Not used by this implementation.</param>
        /// <param name="cancellationToken">Not used by this implementation.</param>
        /// <returns>The response content as a string.</returns>
        /// <exception cref="InvalidOperationException">The response status code is not a success code.</exception>
        private async Task<string> GetContent(HttpResponseMessage response, ICaptchaWindow captchaWindow, int pageNumber, CancellationTokenSource cancellationToken)
        {
            bool failed = !response.IsSuccessStatusCode;

            if (failed)
            {
                // The status code is embedded in the message so the caller can see why the search stopped.
                string message = string.Format("Cannot continue google search because it retrieved error '{0}'", response.StatusCode.ToString());
                throw new InvalidOperationException(message);
            }

            string content = await response.Content.ReadAsStringAsync();
            return content;
        }
        /// <summary>
        /// Creates a search engine for the given keywords and prepares the query string template.
        /// </summary>
        /// <param name="allowedCount">Value stored in the <c>allowedCount</c> field.</param>
        /// <param name="keywords">Space-separated search terms; each term is URI-escaped and the terms are joined with '+'.</param>
        /// <param name="captchaWindow">Value stored in the <c>captchaWindow</c> field.</param>
        public GoogleSearchEngine(int allowedCount, string keywords, ICaptchaWindow captchaWindow)
        {
            // Split on spaces, ignoring empty pieces produced by repeated spaces.
            string[] words = keywords.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            string[] encodedWords = words.Select(word => Uri.EscapeDataString(word)).ToArray();

            // "{0}" is left in place as a format placeholder for the "start" query parameter.
            queryString = "http://www.google.com.ua/search?q=" + string.Join("+", encodedWords) + "&start={0}";

            linkNumber = 0;

            this.allowedCount = allowedCount;
            this.captchaWindow = captchaWindow;

            searchOnPage = null;
        }
        /// <summary>
        /// Builds the query string template from the keywords and resets the engine's state fields.
        /// </summary>
        /// <param name="allowedCount">Value stored in the <c>allowedCount</c> field.</param>
        /// <param name="keywords">Search terms separated by spaces; empty pieces are dropped and each term is URI-escaped.</param>
        /// <param name="captchaWindow">Value stored in the <c>captchaWindow</c> field.</param>
        public GoogleSearchEngine(int allowedCount, string keywords, ICaptchaWindow captchaWindow)
        {
            char[] separators = new char[] { ' ' };

            string[] terms = keywords
                .Split(separators, StringSplitOptions.RemoveEmptyEntries)
                .Select(term => Uri.EscapeDataString(term))
                .ToArray();

            // The trailing "{0}" stays a format placeholder for the "start" query parameter.
            queryString = new StringBuilder("http://www.google.com.ua/search?q=")
                .Append(string.Join("+", terms))
                .Append("&start={0}")
                .ToString();

            linkNumber = 0;

            this.allowedCount = allowedCount;
            this.captchaWindow = captchaWindow;

            searchOnPage = null;
        }
Exemple #5
0
        /// <summary>
        /// Downloads <paramref name="googlePage"/>, stores its text in <c>pageContent</c> and adds to
        /// <c>urls</c> every anchor target that starts with <c>urlPrefix</c>, is a well-formed absolute
        /// URI once the prefix (and anything from <c>urlSuffix</c> on) is removed, and is not rejected
        /// by <c>InException</c>.
        /// </summary>
        /// <param name="googlePage">Address of the page to download.</param>
        /// <param name="captchaWindow">Passed through to <c>GetContent</c>.</param>
        /// <param name="pageNumber">Passed through to <c>GetContent</c>.</param>
        /// <param name="cancellationToken">Source whose token is handed to the HTTP request; also passed to <c>GetContent</c>.</param>
        /// <returns>A task that completes when the page has been downloaded and scanned.</returns>
        public async Task Initialize(Uri googlePage, ICaptchaWindow captchaWindow, int pageNumber, CancellationTokenSource cancellationToken)
        {
            // NOTE(review): a new HttpClient is created on every call; consider sharing one instance.
            using (HttpClient client = new HttpClient())
                using (HttpResponseMessage response = await client.GetAsync(googlePage, cancellationToken.Token))
                {
                    pageContent = await GetContent(response, captchaWindow, pageNumber, cancellationToken);
                }

            Regex regex = new Regex("<a[^>]*?href\\s*=\\s*(?<url>[\"']?([^\"'>]+?)['\"])?[^>]*?>");

            foreach (Match match in regex.Matches(pageContent))
            {
                Group urlGroup = match.Groups["url"];

                // The "url" group is optional in the pattern: an anchor such as <a href=foo> still
                // matches but captures nothing. Skip it instead of slicing an empty string.
                if (!urlGroup.Success)
                {
                    continue;
                }

                string url = urlGroup.Value;

                // A successful capture always ends with a quote character; drop it.
                url = url.Substring(0, url.Length - 1);

                // The opening quote is optional in the pattern, so strip it only when it is really
                // there; otherwise the first character of the address would be lost.
                if (url.Length > 0 && (url[0] == '"' || url[0] == '\''))
                {
                    url = url.Substring(1);
                }

                // Prefix and suffix are markup tokens, not linguistic text: compare ordinally.
                if (!url.StartsWith(urlPrefix, StringComparison.Ordinal))
                {
                    continue;
                }

                url = url.Substring(urlPrefix.Length);

                int index = url.IndexOf(urlSuffix, StringComparison.Ordinal);

                if (index != -1)
                {
                    url = url.Substring(0, index);
                }

                if (!Uri.IsWellFormedUriString(url, UriKind.Absolute))
                {
                    continue;
                }

                Uri uri = new Uri(url);

                if (!InException(uri))
                {
                    urls.Add(uri);
                }
            }
        }
        /// <summary>
        /// Fetches <paramref name="googlePage"/>, keeps its text in <c>pageContent</c> and scans it for
        /// anchor tags. Each quoted href that starts with <c>urlPrefix</c> has the prefix removed and is
        /// cut at <c>urlSuffix</c> when present; if the remainder is a well-formed absolute URI that
        /// <c>InException</c> does not reject, it is added to <c>urls</c>.
        /// </summary>
        /// <param name="googlePage">Address of the page to fetch.</param>
        /// <param name="captchaWindow">Passed through to <c>GetContent</c>.</param>
        /// <param name="pageNumber">Passed through to <c>GetContent</c>.</param>
        /// <param name="cancellationToken">Source whose token is handed to the HTTP request; also passed to <c>GetContent</c>.</param>
        /// <returns>A task that completes when the page has been fetched and scanned.</returns>
        public async Task Initialize(Uri googlePage, ICaptchaWindow captchaWindow, int pageNumber, CancellationTokenSource cancellationToken)
        {
            // NOTE(review): a new HttpClient is created on every call; consider sharing one instance.
            using (HttpClient client = new HttpClient())
            using (HttpResponseMessage response = await client.GetAsync(googlePage, cancellationToken.Token))
            {
                pageContent = await GetContent(response, captchaWindow, pageNumber, cancellationToken);
            }

            Regex regex = new Regex("<a[^>]*?href\\s*=\\s*(?<url>[\"']?([^\"'>]+?)['\"])?[^>]*?>");

            foreach (Match match in regex.Matches(pageContent))
            {
                Group urlGroup = match.Groups["url"];

                // The pattern makes the "url" group optional, so anchors with an unquoted href match
                // with an empty capture. Slicing that would throw; ignore such anchors.
                if (!urlGroup.Success)
                {
                    continue;
                }

                string url = urlGroup.Value;

                // The capture is guaranteed to end with a quote; remove it.
                url = url.Substring(0, url.Length - 1);

                // Only remove a leading character when it is actually a quote, because the pattern
                // allows the opening quote to be missing.
                if (url.Length > 0 && (url[0] == '"' || url[0] == '\''))
                {
                    url = url.Substring(1);
                }

                // Ordinal comparison: the prefix and suffix are markup tokens, not culture-aware text.
                if (!url.StartsWith(urlPrefix, StringComparison.Ordinal))
                {
                    continue;
                }

                url = url.Substring(urlPrefix.Length);

                int index = url.IndexOf(urlSuffix, StringComparison.Ordinal);

                if (index != -1)
                {
                    url = url.Substring(0, index);
                }

                if (!Uri.IsWellFormedUriString(url, UriKind.Absolute))
                {
                    continue;
                }

                Uri uri = new Uri(url);

                if (!InException(uri))
                {
                    urls.Add(uri);
                }
            }
        }