/// <summary>
/// Parses a downloaded page into a <see cref="PageResult"/> for the product channel.
/// </summary>
/// <param name="request">The request that produced <paramref name="response"/>.</param>
/// <param name="response">The downloaded response whose <c>Content</c> is parsed as HTML.</param>
/// <returns>
/// The result of <c>PageResult.EmptyResponse</c> when the content is null, empty or whitespace.
/// Otherwise a result carrying the requests returned by <c>FindNewRequest</c>; its <c>Data</c>
/// is populated only when the request's <c>UrlType</c> is <c>UrlType.Extracting</c> and is
/// <c>null</c> in every other case.
/// </returns>
public PageResult Analyze(Request request, Response response)
        {
            // Nothing to parse: bail out before building an HTML document.
            if (string.IsNullOrWhiteSpace(response.Content))
            {
                return PageResult.EmptyResponse(Site.Topic, request, response, Channel.Product);
            }

            var document = new HtmlDocument();
            document.LoadHtml(response.Content);

            // todo: regexPattern
            var discoveredRequests = FindNewRequest(document, request, @"[\s\S]*", "/Product/Details/");

            // Stays null unless this request is an extracting one.
            List<ResultField> extractedFields = null;
            if (request.UrlType == UrlType.Extracting)
            {
                extractedFields = XpathSelect(document, _fieldXPaths);

                // Append request/site/response metadata after the XPath-selected fields.
                extractedFields.AddRange(new[]
                {
                    new ResultField { Name = "Uri", Value = request.Url },
                    new ResultField { Name = "SiteName", Value = Site.Name },
                    new ResultField { Name = "SiteDomain", Value = Site.Domain },
                    new ResultField { Name = "ElapsedSecond", Value = response.MillisecondTime.ToString() },
                    new ResultField { Name = "Downloader", Value = response.Downloader },
                    new ResultField { Name = "CommentCount", Value = "0" }
                });
            }

            return new PageResult
            {
                Request = request,
                Response = response,
                NewRequests = discoveredRequests,
                Channel = Channel.Product,
                Data = extractedFields,
                Topic = Site.Topic
            };
        }
Example #2
0
 /// <summary>
 /// Builds a <see cref="PageResult"/> that carries only the topic, request, response and channel.
 /// </summary>
 /// <param name="topic">Value stored in <c>Topic</c>.</param>
 /// <param name="request">Value stored in <c>Request</c>.</param>
 /// <param name="response">Value stored in <c>Response</c>.</param>
 /// <param name="channel">Value stored in <c>Channel</c>.</param>
 /// <returns>A new result with no other members assigned by this method.</returns>
 public static PageResult EmptyResponse(string topic, Request request, Response response, string channel)
 {
     var emptyResult = new PageResult();
     emptyResult.Topic = topic;
     emptyResult.Request = request;
     emptyResult.Response = response;
     emptyResult.Channel = channel;
     return emptyResult;
 }
        /// <summary>
        /// Queues one background HTTP GET per request and reports each outcome through callbacks.
        /// </summary>
        /// <remarks>
        /// The method returns once the work has been queued; it does not wait for any download.
        /// <c>_currentTaskNumber</c> is incremented when a request is queued and decremented when
        /// its download attempt ends, whether it succeeded or threw.
        /// </remarks>
        /// <param name="requests">Requests to download; each one is fetched from its <c>Url</c>.</param>
        /// <param name="onDownloadComplete">
        /// Invoked with the request and its response once a download attempt finishes without
        /// throwing. A non-success HTTP status still counts as finished: the response then has a
        /// null <c>Content</c>. Not invoked when the attempt throws or is cancelled.
        /// </param>
        /// <param name="onConsumed">Invoked once per request after the step above, on success or failure.</param>
        public void DownloadAsync(IEnumerable<Request> requests, Action<Request, Response> onDownloadComplete, Action onConsumed)
        {
            foreach (Request req in requests)
            {
                // _currentTaskNumber++
                Interlocked.Increment(ref _currentTaskNumber);
                Task.Run(async () =>
                {
                    try
                    {
                        var client = HttpClientBuilder.GetClient(req.SiteId);
                        Console.WriteLine(@"开始执行Http下载,占位符." + req.Url);

                        // Everything needed is copied out of the message below, so it can be
                        // disposed here instead of being left for the finalizer.
                        using (var httpResp = await client.GetAsync(req.Url))
                        {
                            string content = null;
                            if (httpResp.IsSuccessStatusCode)
                            {
                                content = await httpResp.Content.ReadAsStringAsync();
                            }

                            Console.WriteLine(@"下载完成:" + req.Url);
                            var resp = new Response()
                            {
                                Request = req,
                                HttpStatusCode = httpResp.StatusCode,
                                IsSuccessCode = httpResp.IsSuccessStatusCode,
                                ReasonPhrase = httpResp.ReasonPhrase,
                                Content = content
                            };
                            return new Tuple<Request, Response>(req, resp);
                        }
                    }
                    finally
                    {
                        // _currentTaskNumber--
                        Interlocked.Decrement(ref _currentTaskNumber);
                    }

                }).ContinueWith(t =>
                {
                    if (t.Status == TaskStatus.RanToCompletion)
                    {
                        onDownloadComplete(t.Result.Item1, t.Result.Item2);
                        return;
                    }

                    // The download threw or was cancelled. Reading t.Result here would rethrow
                    // inside this continuation and the error would vanish as an unobserved task
                    // exception, so observe it explicitly and log it instead.
                    var failure = t.Exception != null
                        ? t.Exception.GetBaseException().Message
                        : "canceled";
                    Console.WriteLine(@"下载失败:" + req.Url + " " + failure);
                }).ContinueWith(t => onConsumed());
            }
        }
        /// <summary>
        /// Downloads a single request with an HTTP GET and returns it paired with its response.
        /// </summary>
        /// <remarks>
        /// <c>_currentTaskNumber</c> is incremented on entry and decremented when the method
        /// finishes, whether it returns or throws. Exceptions from the HTTP call propagate to
        /// the caller unchanged.
        /// </remarks>
        /// <param name="request">The request to download; its <c>Url</c> is fetched.</param>
        /// <returns>
        /// The request together with a response holding the status code, reason phrase and, for
        /// success status codes only, the body as both <c>Content</c> and <c>StreamContent</c>.
        /// For other status codes both are <c>null</c>.
        /// </returns>
        public async Task<Tuple<Request, Response>> DownloadAsync(Request request)
        {
            // Incremented before the try so the finally below pairs with it exactly once.
            Interlocked.Increment(ref _currentTaskNumber);
            try
            {
                var client = HttpClientBuilder.GetClient(request.SiteId);

                // NOTE(review): the message is not disposed here because the stream taken from
                // it is handed to the caller via StreamContent - confirm who disposes that stream.
                var httpRespMessage = await client.GetAsync(request.Url);
                string content = null;
                Stream stream = null;
                if (httpRespMessage.IsSuccessStatusCode)
                {
                    content = await httpRespMessage.Content.ReadAsStringAsync();
                    stream = await httpRespMessage.Content.ReadAsStreamAsync();
                }

                var resp = new Response()
                {
                    // Set so the response carries its request, as the batch overload does.
                    Request = request,
                    HttpStatusCode = httpRespMessage.StatusCode,
                    IsSuccessCode = httpRespMessage.IsSuccessStatusCode,
                    ReasonPhrase = httpRespMessage.ReasonPhrase,
                    Content = content,
                    StreamContent = stream
                };
                return new Tuple<Request, Response>(request, resp);
            }
            finally
            {
                Interlocked.Decrement(ref _currentTaskNumber);
            }
        }