/// <summary>
/// Turns a downloaded page into a <see cref="PageResult"/> for the product channel.
/// Follow-up requests are always collected from the parsed HTML; field extraction only
/// happens when the request's URL type is <c>UrlType.Extracting</c>.
/// </summary>
/// <param name="request">The request that produced <paramref name="response"/>.</param>
/// <param name="response">The downloaded response; its <c>Content</c> is parsed as HTML.</param>
/// <returns>
/// The result of <c>PageResult.EmptyResponse</c> when the content is null, empty or whitespace;
/// otherwise a result carrying the new requests and the extracted fields
/// (<c>Data</c> stays <c>null</c> when the request is not an extracting one).
/// </returns>
public PageResult Analyze(Request request, Response response)
{
    // Nothing to parse: hand back an empty result right away.
    if (string.IsNullOrWhiteSpace(response.Content))
    {
        return PageResult.EmptyResponse(Site.Topic, request, response, Channel.Product);
    }

    var html = new HtmlDocument();
    html.LoadHtml(response.Content);

    //todo: regexPattern
    var discovered = FindNewRequest(html, request, @"[\s\S]*", "/Product/Details/");

    List<ResultField> fields = null;
    if (request.UrlType == UrlType.Extracting)
    {
        // Fields selected by XPath come first, followed by request/site metadata.
        fields = XpathSelect(html, _fieldXPaths);
        fields.AddRange(new[]
        {
            new ResultField { Name = "Uri", Value = request.Url },
            new ResultField { Name = "SiteName", Value = Site.Name },
            new ResultField { Name = "SiteDomain", Value = Site.Domain },
            new ResultField { Name = "ElapsedSecond", Value = response.MillisecondTime.ToString() },
            new ResultField { Name = "Downloader", Value = response.Downloader },
            new ResultField { Name = "CommentCount", Value = "0" }
        });
    }

    return new PageResult
    {
        Request = request,
        Response = response,
        NewRequests = discovered,
        Channel = Channel.Product,
        Data = fields,
        Topic = Site.Topic
    };
}
/// <summary>
/// Creates a <see cref="PageResult"/> that only carries the topic, request, response and channel.
/// No other members are assigned here.
/// </summary>
/// <param name="topic">Value stored in <c>Topic</c>.</param>
/// <param name="request">Value stored in <c>Request</c>.</param>
/// <param name="response">Value stored in <c>Response</c>.</param>
/// <param name="channel">Value stored in <c>Channel</c>.</param>
/// <returns>The newly created result.</returns>
public static PageResult EmptyResponse(string topic, Request request, Response response, string channel)
{
    var empty = new PageResult();
    empty.Topic = topic;
    empty.Request = request;
    empty.Response = response;
    empty.Channel = channel;
    return empty;
}
/// <summary>
/// Starts one background download per request and reports each outcome through callbacks.
/// The method returns as soon as the work has been queued; it does not wait for the downloads.
/// </summary>
/// <param name="requests">Requests to download; each one is fetched with an HTTP GET on <c>Url</c>.</param>
/// <param name="onDownloadComplete">
/// Invoked with the request and its response once the HTTP call has returned (for any status code).
/// It is not invoked when the download itself throws; that failure is written to the console instead.
/// </param>
/// <param name="onConsumed">Invoked once per request after the download attempt and callback, whether or not they succeeded.</param>
public void DownloadAsync(IEnumerable<Request> requests, Action<Request, Response> onDownloadComplete, Action onConsumed)
{
    foreach (Request req in requests)
    {
        // Count the request as in flight before its work is queued.
        Interlocked.Increment(ref _currentTaskNumber);

        Task.Run(async () =>
        {
            try
            {
                Response resp = null;
                try
                {
                    var client = HttpClientBuilder.GetClient(req.SiteId);
                    Console.WriteLine(@"开始执行Http下载,占位符." + req.Url);

                    // The body is copied into a string below, so the response message can be
                    // disposed as soon as this block ends.
                    using (var httpResp = await client.GetAsync(req.Url))
                    {
                        string content = null;
                        if (httpResp.IsSuccessStatusCode)
                        {
                            content = await httpResp.Content.ReadAsStringAsync();
                        }

                        Console.WriteLine(@"下载完成:" + req.Url);

                        resp = new Response()
                        {
                            Request = req,
                            HttpStatusCode = httpResp.StatusCode,
                            IsSuccessCode = httpResp.IsSuccessStatusCode,
                            ReasonPhrase = httpResp.ReasonPhrase,
                            Content = content
                        };
                    }
                }
                catch (Exception ex)
                {
                    // Previously the failure surfaced only as an unobserved task exception;
                    // make it visible while keeping the callback contract unchanged.
                    Console.WriteLine("Download failed: " + req.Url + " - " + ex);
                }
                finally
                {
                    // The download attempt is over, successful or not.
                    Interlocked.Decrement(ref _currentTaskNumber);
                }

                if (resp != null)
                {
                    onDownloadComplete(req, resp);
                }
            }
            finally
            {
                // Always signal consumption, even if the download or the callback failed.
                onConsumed();
            }
        });
    }
}
/// <summary>
/// Downloads a single request with an HTTP GET and wraps the outcome in a <see cref="Response"/>.
/// </summary>
/// <param name="request">The request to download; <c>SiteId</c> selects the client and <c>Url</c> is fetched.</param>
/// <returns>
/// The request paired with its response. <c>Content</c> and <c>StreamContent</c> are only populated
/// for success status codes; otherwise both are <c>null</c>.
/// </returns>
/// <remarks>
/// Exceptions thrown while downloading propagate to the caller. The in-flight counter is
/// decremented in every case.
/// </remarks>
public async Task<Tuple<Request, Response>> DownloadAsync(Request request)
{
    Interlocked.Increment(ref _currentTaskNumber);
    try
    {
        var client = HttpClientBuilder.GetClient(request.SiteId);
        var httpRespMessage = await client.GetAsync(request.Url);

        string content = null;
        Stream stream = null;
        if (httpRespMessage.IsSuccessStatusCode)
        {
            // The body is read as text first and then exposed as a stream as well.
            content = await httpRespMessage.Content.ReadAsStringAsync();
            stream = await httpRespMessage.Content.ReadAsStreamAsync();
        }

        var resp = new Response()
        {
            // Set for consistency with the batch overload, which also links the response to its request.
            Request = request,
            HttpStatusCode = httpRespMessage.StatusCode,
            IsSuccessCode = httpRespMessage.IsSuccessStatusCode,
            ReasonPhrase = httpRespMessage.ReasonPhrase,
            Content = content,
            StreamContent = stream
        };

        if (stream == null)
        {
            // No stream was handed out, so nothing still depends on the response message.
            // When a stream is returned, the message must stay alive because it owns that stream.
            httpRespMessage.Dispose();
        }

        return new Tuple<Request, Response>(request, resp);
    }
    finally
    {
        Interlocked.Decrement(ref _currentTaskNumber);
    }
}