Beispiel #1
0
        /// <summary>
        /// Downloads the search results page for the domain named in <paramref name="message"/>,
        /// reads the indexed-pages count from the element with id "resultStats", and replies to
        /// the sender with either a <c>DomainStat</c> or a <c>ParsingError</c>.
        /// </summary>
        /// <param name="message">
        /// Supplies the domain URL (<c>Url</c>) and whether the request should go through a
        /// proxy (<c>UseProxy</c>).
        /// </param>
        /// <remarks>
        /// A missing or empty "resultStats" element is reported as zero indexed pages.
        /// Exceptions thrown while downloading the page are not caught here and propagate
        /// to the caller.
        /// </remarks>
        private void HandleDomainUrl(ParserInput message)
        {
            string pageData;

            // The client is only needed for the download, so release it before parsing.
            using (var webClient = new WebClient())
            {
                if (message.UseProxy)
                {
                    webClient.Proxy = _proxyProvider.GetProxy();
                }

                var url = _proxyProvider.GetRequestUrl(string.Format(GoogleSearchUrl, message.Url));

                // Request the English page so the number uses a predictable format.
                webClient.Headers.Add("Accept-Language", "en-US");

                pageData = webClient.DownloadString(url);
            }

            var page = new HtmlDocument();
            page.LoadHtml(pageData);

            HtmlNode resultStatsNode = page.GetElementbyId("resultStats");

            // No stats element (or an empty one) is treated as "nothing indexed".
            long pagesNumber = 0;

            if (resultStatsNode != null && !string.IsNullOrEmpty(resultStatsNode.InnerHtml))
            {
                var resultStats = HtmlEntity.DeEntitize(resultStatsNode.InnerHtml);

                if (!TryParseIndexedPagesNumber(resultStats, out pagesNumber))
                {
                    var errorMessage = "Match was not successful! Result stats: " + resultStats;
                    var parsingError = new ParsingError {
                        DomainURL = message.Url, ErrorMessage = errorMessage
                    };

                    Sender.Tell(parsingError, Self);
                    return;
                }
            }

            var domainStat = new DomainStat {
                DomainURL    = message.Url,
                IndexingData = new IndexingData
                {
                    PagesNumber    = pagesNumber,
                    ProcessingDate = DateTime.Now.Date
                }
            };

            Sender.Tell(domainStat, Self);
        }

        /// <summary>
        /// Extracts the first number from the result-stats text, ignoring thousands
        /// separators (comma, dot or whitespace).
        /// </summary>
        /// <param name="resultStats">De-entitized inner HTML of the result-stats element.</param>
        /// <param name="pagesNumber">The parsed number, or 0 when parsing fails.</param>
        /// <returns>
        /// <c>true</c> when a number was found and fits into a <see cref="long"/>;
        /// otherwise <c>false</c>.
        /// </returns>
        private static bool TryParseIndexedPagesNumber(string resultStats, out long pagesNumber)
        {
            pagesNumber = 0;

            if (string.IsNullOrEmpty(resultStats))
            {
                return false;
            }

            // Take only the FIRST numeric token. Spanning to the last digit in the text would
            // swallow any later number (for example a "(0.45 seconds)" suffix) and corrupt the count.
            var numberMatch = Regex.Match(resultStats, @"[0-9]+(?:[.,\s][0-9]{3})*");
            if (!numberMatch.Success)
            {
                return false;
            }

            var digitsOnly = Regex.Replace(numberMatch.Value, @"[^0-9]", string.Empty);

            // TryParse with the invariant culture: no exception on overflow, no dependency
            // on the machine's regional settings.
            return long.TryParse(
                digitsOnly,
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out pagesNumber);
        }