/// <summary>
/// Creates a crawler that works with the supplied page parser and starts
/// with an empty website map and an empty queue of links to crawl.
/// </summary>
/// <param name="parser">Parser instance stored for later use by the crawler.</param>
public Crawler(IWebPageParser parser)
{
    _parser = parser;

    // Fresh, empty working state for every crawler instance.
    LinksToCrawl = new Queue<string>();
    WebsiteMap = new ConcurrentDictionary<string, WebPage>();
}
// Builds the object graph used by the tests: a substituted IRestClient is
// handed to a real WebPageParser, which in turn is handed to the Crawler
// under test. Each piece is also kept in its fixture field.
public void Setup()
{
    var substitutedClient = Substitute.For<IRestClient>();
    var realParser = new WebPageParser(substitutedClient);

    restClientMock = substitutedClient;
    webPageParser = realParser;
    sut = new Crawler(realParser);
}
/// <summary>
/// Creates the job and stores the collaborators it is given; no other work
/// is performed here.
/// </summary>
/// <param name="unitOfWork">Unit of work kept in <c>_unitOfWork</c>.</param>
/// <param name="httpClient">HTTP client kept in <c>_httpClient</c>.</param>
/// <param name="pageParser">Page parser kept in <c>_pageParser</c>.</param>
/// <param name="feedItems">Feed item repository kept in <c>_feedItems</c>.</param>
public PageCrawlingJob(
    IUnitOfWork unitOfWork,
    IHttpClient httpClient,
    IWebPageParser pageParser,
    IRepository<FeedItem> feedItems)
{
    _feedItems = feedItems;
    _pageParser = pageParser;
    _httpClient = httpClient;
    _unitOfWork = unitOfWork;
}
/// <summary>
/// Creates a loader for the given root URL, remembering the caller's options
/// and constructing the parser, reader and directory/file wrappers it uses.
/// </summary>
/// <param name="rootUrl">Root URL; stored and also passed to the directory and file wrappers.</param>
/// <param name="extensions">Extension list stored as-is (defaults to <c>null</c>).</param>
/// <param name="isVerbose">Verbosity flag stored as-is (defaults to <c>true</c>).</param>
/// <param name="domainLimit">Domain limit stored as-is (defaults to <see cref="DomainLimit.WithoutLimits"/>).</param>
public WebPageLoader(
    string rootUrl,
    List<string> extensions = null,
    bool isVerbose = true,
    DomainLimit domainLimit = DomainLimit.WithoutLimits)
{
    // Caller-supplied settings.
    _rootUrl = rootUrl;
    _availableExtensions = extensions;
    _isVerbose = isVerbose;
    _domainLimit = domainLimit;

    // Collaborators owned by this loader (created in the same order as before).
    _parser = new WebPageParser();
    _reader = new WebPageReader();
    _directoryWrapper = new DirectoryWrapper(rootUrl);
    _fileWrapper = new FileWrapper(rootUrl);
}
/// <summary>
/// Fetches the HTML at <paramref name="url"/> through the HTML scraper (using the
/// Amazon proxy type) and returns whatever <paramref name="pageParser"/> extracts
/// as large images from it.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="pageParser">
/// Parser used both to validate the downloaded content and to extract the images.
/// </param>
/// <param name="downloadedSize">
/// Set to 0 on entry and to the character length of the downloaded HTML once the
/// download has succeeded.
/// </param>
/// <returns>
/// The parser's result on success. A result with <c>Status = CallStatus.Fail</c> and the
/// caught exception when anything throws. A bare failed result when the download did not
/// succeed and <c>CallHelper.ThrowIfFail</c> returned without throwing.
/// </returns>
public CallResult<IList<ImageInfo>> GetMainImageFromUrl(string url, IWebPageParser pageParser, out long downloadedSize)
{
    downloadedSize = 0;
    var pageHtml = String.Empty;

    try
    {
        // Sample of the page markup kept from the original note (abbreviated; the
        // original listed seven size variants in data-a-dynamic-image):
        //
        //   <div id="imgTagWrapperId" class="imgTagWrapper" style="height: 801.299px;">
        //     <img alt="Sara's Prints Little Girls' Short Sleeve Nightie, Red/Pink Chevron, 2"
        //          src="http://ecx.images-amazon.com/images/I/81YDdI2Rk0L._UY879_.jpg"
        //          data-old-hires="http://ecx.images-amazon.com/images/I/81YDdI2Rk0L._UL1500_.jpg"
        //          class="a-dynamic-image a-stretch-vertical" id="landingImage"
        //          data-a-dynamic-image="{"http://.../81YDdI2Rk0L._UY606_.jpg":[405,606],
        //                                 "http://.../81YDdI2Rk0L._UY879_.jpg":[587,879], ...}"
        //          style="max-height: 801px; max-width: 587px;">
        //   </div>

        // A response is accepted only when it is HTTP 200 and the parser validates the content.
        var download = _htmlScraper.GetHtml(
            url,
            ProxyUseTypes.Amazon,
            (proxy, status, content) => status == HttpStatusCode.OK && pageParser.ValidateHtml(content));

        if (download.IsSuccess)
        {
            pageHtml = download.Data;
            downloadedSize = pageHtml.Length;
            return pageParser.GetLargeImages(pageHtml);
        }

        // Presumably throws for a failed result (handled by the catch below); if it
        // returns instead, execution continues to the generic failure at the end.
        CallHelper.ThrowIfFail(download);
    }
    catch (Exception ex)
    {
        _log.Error("Parsing html page issue, url=" + url, ex);
        _log.Info("HTML: " + pageHtml);

        return new CallResult<IList<ImageInfo>>()
        {
            Status = CallStatus.Fail,
            Exception = ex,
        };
    }

    return new CallResult<IList<ImageInfo>>()
    {
        Status = CallStatus.Fail
    };
}