Пример #1
0
        /// <summary>
        /// Creates a crawler that keeps the supplied page parser and starts with
        /// an empty website map and an empty queue of links to crawl.
        /// </summary>
        /// <param name="parser">Page parser stored in <c>_parser</c> for later use.</param>
        public Crawler(IWebPageParser parser)
        {
            // Keep the injected collaborator first, then set up the empty state.
            _parser = parser;

            WebsiteMap = new ConcurrentDictionary<string, WebPage>();
            LinksToCrawl = new Queue<string>();
        }
Пример #2
0
        /// <summary>
        /// Builds the object graph used by the tests: a substituted
        /// <c>IRestClient</c>, a <c>WebPageParser</c> on top of it, and the
        /// <c>Crawler</c> under test on top of the parser.
        /// </summary>
        public void Setup()
        {
            // Stand-in for the REST client so the parser never touches a real one.
            restClientMock = Substitute.For<IRestClient>();

            // Concrete parser wired to the substituted client.
            webPageParser = new WebPageParser(restClientMock);

            // System under test.
            sut = new Crawler(webPageParser);
        }
Пример #3
0
 /// <summary>
 /// Creates the job and stores each injected dependency in its backing field.
 /// </summary>
 /// <param name="unitOfWork">Unit of work stored in <c>_unitOfWork</c>.</param>
 /// <param name="httpClient">HTTP client stored in <c>_httpClient</c>.</param>
 /// <param name="pageParser">Page parser stored in <c>_pageParser</c>.</param>
 /// <param name="feedItems">Feed item repository stored in <c>_feedItems</c>.</param>
 public PageCrawlingJob(
     IUnitOfWork unitOfWork,
     IHttpClient httpClient,
     IWebPageParser pageParser,
     IRepository<FeedItem> feedItems)
 {
     // Persistence-related dependencies.
     _unitOfWork = unitOfWork;
     _feedItems = feedItems;

     // Page retrieval and parsing dependencies.
     _httpClient = httpClient;
     _pageParser = pageParser;
 }
 /// <summary>
 /// Creates a loader for the given root URL, storing the caller's options and
 /// constructing its own parser, reader, and directory/file wrappers.
 /// </summary>
 /// <param name="rootUrl">Root URL; also passed to the directory and file wrappers.</param>
 /// <param name="extensions">
 /// Stored as-is in <c>_availableExtensions</c>; defaults to <c>null</c>.
 /// NOTE(review): how a null list is interpreted is decided elsewhere - confirm.
 /// </param>
 /// <param name="isVerbose">Verbosity flag; defaults to <c>true</c>.</param>
 /// <param name="domainLimit">Domain limit setting; defaults to <c>DomainLimit.WithoutLimits</c>.</param>
 public WebPageLoader(
     string rootUrl,
     List<string> extensions = null,
     bool isVerbose = true,
     DomainLimit domainLimit = DomainLimit.WithoutLimits)
 {
     // Caller-supplied settings.
     _rootUrl = rootUrl;
     _availableExtensions = extensions;
     _isVerbose = isVerbose;
     _domainLimit = domainLimit;

     // Internally created collaborators (construction order kept as before).
     _parser = new WebPageParser();
     _reader = new WebPageReader();
     _directoryWrapper = new DirectoryWrapper(rootUrl);
     _fileWrapper = new FileWrapper(rootUrl);
 }
Пример #5
0
        /// <summary>
        /// Fetches the HTML at <paramref name="url"/> through <c>_htmlScraper</c>
        /// (using <c>ProxyUseTypes.Amazon</c>) and asks <paramref name="pageParser"/>
        /// to extract the large images from it.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="pageParser">
        /// Parser used both to validate the downloaded content and to extract the images.
        /// </param>
        /// <param name="downloadedSize">
        /// Set to the character count of the downloaded HTML on success; stays 0 otherwise.
        /// </param>
        /// <returns>
        /// The result of <c>pageParser.GetLargeImages</c> when the download succeeds.
        /// Otherwise a result with <c>CallStatus.Fail</c>, carrying the caught exception
        /// when one was thrown.
        /// </returns>
        public CallResult <IList <ImageInfo> > GetMainImageFromUrl(string url, IWebPageParser pageParser, out long downloadedSize)
        {
            downloadedSize = 0;
            var pageHtml = String.Empty;

            try
            {
                // Sample of the markup this method was written against (original author's note):
                //   <div id="imgTagWrapperId" class="imgTagWrapper">
                //     <img id="landingImage" class="a-dynamic-image a-stretch-vertical"
                //          src="http://ecx.images-amazon.com/images/I/81YDdI2Rk0L._UY879_.jpg"
                //          data-old-hires="http://ecx.images-amazon.com/images/I/81YDdI2Rk0L._UL1500_.jpg"
                //          data-a-dynamic-image="{ &quot;<image url>&quot;:[width,height], ... }">
                //   </div>

                // A response is accepted only when it is HTTP 200 and the parser validates the content.
                var fetched = _htmlScraper.GetHtml(
                    url,
                    ProxyUseTypes.Amazon,
                    (proxy, status, content) => status == HttpStatusCode.OK && pageParser.ValidateHtml(content));

                if (!fetched.IsSuccess)
                {
                    // Presumably throws for a failed result, which lands in the catch below;
                    // if it returns, the generic failure at the end of the method is used.
                    CallHelper.ThrowIfFail(fetched);
                }
                else
                {
                    pageHtml       = fetched.Data;
                    downloadedSize = pageHtml.Length;

                    return pageParser.GetLargeImages(pageHtml);
                }
            }
            catch (Exception ex)
            {
                // Log the failure and whatever HTML had been captured so far.
                _log.Error("Parsing html page issue, url=" + url, ex);
                _log.Info("HTML: " + pageHtml);

                var failure = new CallResult <IList <ImageInfo> >();
                failure.Status    = CallStatus.Fail;
                failure.Exception = ex;
                return failure;
            }

            // Download failed without an exception being raised.
            var noData = new CallResult <IList <ImageInfo> >();
            noData.Status = CallStatus.Fail;
            return noData;
        }