/// <summary>
/// Wait callback registered by <c>RequestResource</c> on the async wait handle of
/// <c>BeginGetResponse</c>. When the wait times out, the request is aborted and the
/// URL is recorded as timed out.
/// </summary>
/// <param name="state">The <c>RequestState</c> of the request being watched.</param>
/// <param name="timedOut">
/// <c>true</c> when the wait expired before the handle was signalled; <c>false</c>
/// when the handle was signalled in time, in which case nothing is done.
/// </param>
private void TimeoutCallback(object state, bool timedOut)
{
    if (!timedOut)
    {
        return;
    }

    RequestState rs = state as RequestState;
    if (rs == null)
    {
        return;
    }

    // HttpWebRequest.Abort() synchronously runs the callback of the outstanding
    // BeginGetResponse (ReceivedResource). That callback sees a WebException,
    // records it, releases the slot and calls RequestResource for it.
    rs.Req.Abort();

    _log.Error("TimeoutCallback: url={0},HttpStatus={1}.", rs.Url, "Timeout");
    UrlInfo urlInfo = new UrlInfo(rs.Url, "TimeoutCallback:TimeOut");
    _dbm.write_to_db(urlInfo);

    // Deliberately do NOT clear _reqsBusy or call RequestResource here: the abort
    // callback above has already redispatched this slot. Doing it again would mark
    // a slot that is busy with the next request as free and start a second
    // concurrent download on it.
}
/// <summary>
/// Fetches the next URL for request slot <paramref name="index"/> via
/// <c>GetUrlAndType</c> and starts an asynchronous GET for it. If starting the
/// request fails, the failure is recorded and the next URL is tried, until a
/// request is in flight or <c>GetUrlAndType</c> returns <c>null</c>.
/// </summary>
/// <param name="index">Index of the request slot in <c>_reqsBusy</c>.</param>
private void RequestResource(int index)
{
    // Loop instead of calling ourselves: a long run of URLs that fail to start
    // would otherwise add one stack frame per failure.
    while (true)
    {
        var urlAndType = GetUrlAndType(index);
        if (urlAndType == null)
        {
            return;
        }

        string url = urlAndType.Item1;
        UrlType urltype = urlAndType.Item2;
        try
        {
            _log.Info("Request {0} Time:{1}.", url, DateTime.Now.ToString());
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Method = _method; // request method
            req.Accept = _accept; // accepted content types
            req.CookieContainer = GetCookie();
            req.UserAgent = _userAgent; // user agent
            RequestState rs = new RequestState(req, url, urltype, index);
            var result = req.BeginGetResponse(new AsyncCallback(ReceivedResource), rs);

            // Abort the request if no response arrives within _maxTime.
            ThreadPool.RegisterWaitForSingleObject(result.AsyncWaitHandle, TimeoutCallback, rs, _maxTime, true);
        }
        catch (WebException we)
        {
            _log.Error("RequestResource: url={0},HttpStatus={1}, Exception:{2}.", url, we.Status, we.Message);
            _log.Error(we.StackTrace);
            UrlInfo urlInfo = new UrlInfo(url, we.Status.ToString());
            _dbm.write_to_db(urlInfo);
            _reqsBusy[index] = false;
        }
        catch (Exception e)
        {
            // WebRequest.Create and the HttpWebRequest cast can fail with
            // UriFormatException, NotSupportedException or InvalidCastException for
            // malformed or non-HTTP URLs. This method also runs on thread-pool
            // callbacks, where an unhandled exception would take the process down,
            // so record the failure the same way the response callbacks do.
            _log.Error("RequestResource: url={0}, Exception:{1}.", url, e.Message);
            _log.Error(e.StackTrace);
            UrlInfo urlInfo = new UrlInfo(url, e.Message);
            _dbm.write_to_db(urlInfo);
            _reqsBusy[index] = false;
        }

        // NOTE(review): the slot is presumably marked busy inside GetUrlAndType
        // (not visible here) - confirm. While it is busy a request is in flight and
        // its callbacks will continue the chain.
        if (_reqsBusy[index])
        {
            return;
        }
    }
}
/// <summary>
/// Normalises a discovered URL and, if it is accepted, queues it for download and
/// records it through <c>_dbm</c>.
/// </summary>
/// <remarks>
/// Normalisation: surrounding whitespace is trimmed, everything from the first
/// inner space onwards is cut off, and trailing '/' characters are removed.
/// Only URLs that pass <c>UrlAvailable</c> and contain "book.douban.com/tag" or
/// "book.douban.com/subject" are queued.
/// </remarks>
/// <param name="url">Raw URL text; <c>null</c> is ignored.</param>
/// <param name="urlType">
/// Kind of URL. Values at or above <c>UrlType.UrlTypeMax</c> are ignored;
/// <c>UrlType.OneBookUrl</c> goes to <c>_urlsUnloadBooks</c>, anything else to
/// <c>_urlsUnloadTags</c>.
/// </param>
private void AddUrls(string url, UrlType urlType)
{
    // A null url would otherwise throw NullReferenceException on Trim().
    if (url == null || urlType >= UrlType.UrlTypeMax)
    {
        return;
    }

    string cleanUrl = url.Trim();
    int end = cleanUrl.IndexOf(' ');
    if (end > 0)
    {
        cleanUrl = cleanUrl.Substring(0, end);
    }

    cleanUrl = cleanUrl.TrimEnd('/');

    if (!UrlAvailable(cleanUrl))
    {
        return;
    }

    bool isDoubanBookUrl = cleanUrl.Contains("book.douban.com/tag")
        || cleanUrl.Contains("book.douban.com/subject");
    if (!isDoubanBookUrl)
    {
        _log.Debug("Try add url failed:{0}.", cleanUrl);
        return;
    }

    if (urlType == UrlType.OneBookUrl)
    {
        _urlsUnloadBooks.Add(cleanUrl, urlType);
    }
    else
    {
        _urlsUnloadTags.Add(cleanUrl, urlType);
    }

    UrlInfo urlInfo = new UrlInfo(cleanUrl, urlType);
    _dbm.write_to_db(urlInfo);
}
/// <summary>
/// Queues <paramref name="urlInfo"/> in the update cache when its <c>_WebUrl</c> is
/// already in <c>_LoadedWebUrl</c>, otherwise in the insert cache. Afterwards each
/// cache that holds at least <c>_cache_cnt</c> entries has its write method called
/// (<c>insertWebrlToDb</c> / <c>updateWeburlToDb</c>). The whole operation runs under
/// <c>_urlLocker</c>.
/// </summary>
/// <param name="urlInfo">The URL record to queue.</param>
public void write_to_db(UrlInfo urlInfo)
{
    lock (_urlLocker)
    {
        bool alreadyLoaded = _LoadedWebUrl.Contains(urlInfo._WebUrl);
        if (!alreadyLoaded)
        {
            _insertUrl_cache.Add(urlInfo);
        }
        else
        {
            _updateUrl_cache.Add(urlInfo);
        }

        // Insert cache is checked first, then the update cache, as two separate steps.
        if (_cache_cnt <= _insertUrl_cache.Count)
        {
            insertWebrlToDb();
        }

        if (_cache_cnt <= _updateUrl_cache.Count)
        {
            updateWeburlToDb();
        }
    }
}
/// <summary>
/// Completion callback for <c>Stream.BeginRead</c> on the response stream.
/// While data keeps arriving, the chunk is decoded with <c>_encoding</c>, appended
/// to <c>rs.Html</c> and another read is started. When the stream is exhausted the
/// accumulated HTML is run through <c>SgmlReader</c>, written out as indented XML
/// and passed to <c>SaveContents</c>. In both the success and the failure case the
/// outcome is written through <c>_dbm</c>, <c>ContentsSaved</c> is raised, the
/// response stream is closed, the slot is released and <c>RequestResource</c> is
/// called for it.
/// </summary>
/// <param name="ar">Async result whose <c>AsyncState</c> is the <c>RequestState</c>.</param>
private void ReceivedData(IAsyncResult ar)
{
    RequestState rs = (RequestState)ar.AsyncState;
    HttpWebRequest req = rs.Req;
    Stream resStream = rs.ResStream;
    string url = rs.Url;
    UrlType urltype = rs.WebUrlType;
    int index = rs.Index;
    string httpStatus;

    try
    {
        int read = resStream.EndRead(ar);
        if (_stop)
        {
            rs.ResStream.Close();
            req.Abort();
            return;
        }

        if (read > 0)
        {
            // NOTE(review): every chunk is decoded on its own, so a multi-byte
            // character that straddles two reads may be decoded incorrectly.
            // Fixing that needs per-request state (a Decoder or a byte buffer on
            // RequestState), which is outside this method.
            using (MemoryStream ms = new MemoryStream(rs.Data, 0, read))
            using (StreamReader reader = new StreamReader(ms, _encoding))
            {
                rs.Html.Append(reader.ReadToEnd());
            }

            // More data may follow; the next invocation of this callback continues.
            resStream.BeginRead(rs.Data, 0, rs.BufferSize, new AsyncCallback(ReceivedData), rs);
            return;
        }

        // read == 0: end of stream, the whole body is in rs.Html.
        string html = rs.Html.ToString();
        SgmlReader sgmlRreader = new SgmlReader();
        sgmlRreader.DocType = "HTML";
        sgmlRreader.InputStream = new StringReader(html);
        StringWriter sw = new StringWriter();
        XmlTextWriter writer = new XmlTextWriter(sw);
        writer.Formatting = Formatting.Indented;
        while (sgmlRreader.Read())
        {
            if (sgmlRreader.NodeType != XmlNodeType.Whitespace)
            {
                writer.WriteNode(sgmlRreader, true);
            }
        }

        SaveContents(sw.ToString(), url, urltype);
        httpStatus = WebExceptionStatus.Success.ToString();
    }
    catch (WebException we)
    {
        _log.Error("ReceivedData: url = {0}, HttpStatus = {1}, Exception:{2}.", url, we.Status, we.Message);
        _log.Error(we.StackTrace);
        httpStatus = we.Status.ToString();
    }
    catch (Exception e)
    {
        _log.Error("ReceivedData: url = {0}, Exception:{1}.", url, e.Message);
        _log.Error(e.StackTrace);
        httpStatus = e.Message;
    }

    // The download is finished or has failed. Close the response stream so the
    // underlying connection is released before this slot starts its next request;
    // leaving it open leaks the connection.
    if (resStream != null)
    {
        try
        {
            resStream.Close();
        }
        catch (Exception closeError)
        {
            _log.Error("ReceivedData: url = {0}, closing response stream failed:{1}.", url, closeError.Message);
        }
    }

    UrlInfo urlInfo = new UrlInfo(url, httpStatus);
    _dbm.write_to_db(urlInfo);

    // Snapshot the delegate so a concurrent unsubscribe cannot null it between
    // the check and the call.
    var contentsSaved = ContentsSaved;
    if (contentsSaved != null)
    {
        contentsSaved(httpStatus, url);
    }

    _reqsBusy[index] = false;
    RequestResource(index);
}
/// <summary>
/// Completion callback for <c>HttpWebRequest.BeginGetResponse</c>.
/// For a 200 OK response the response stream is stored in the
/// <c>RequestState</c> and the first asynchronous read is started
/// (<c>ReceivedData</c> continues from there). Any other outcome closes the
/// response and releases the slot; exceptions are additionally logged, written
/// through <c>_dbm</c> and reported via <c>ContentsSaved</c>. If the slot is free
/// at the end, <c>RequestResource</c> is called for it.
/// </summary>
/// <param name="ar">Async result whose <c>AsyncState</c> is the <c>RequestState</c>.</param>
private void ReceivedResource(IAsyncResult ar)
{
    RequestState rs = (RequestState)ar.AsyncState;
    HttpWebRequest req = rs.Req;
    string url = rs.Url;
    HttpWebResponse res = null;

    // Response that must be closed on a failure path so its connection is released.
    WebResponse failedResponse = null;

    try
    {
        res = (HttpWebResponse)req.EndGetResponse(ar);
        if (_stop)
        {
            res.Close();
            req.Abort();
            return;
        }

        if (res != null && res.StatusCode == HttpStatusCode.OK)
        {
            Stream resStream = res.GetResponseStream();
            rs.ResStream = resStream;
            resStream.BeginRead(rs.Data, 0, rs.BufferSize, new AsyncCallback(ReceivedData), rs);
        }
        else
        {
            // The original dereferenced res here even when the null check above
            // had just failed.
            if (res != null)
            {
                res.Close();
            }

            rs.Req.Abort();
            _reqsBusy[rs.Index] = false;
        }
    }
    catch (WebException we)
    {
        _log.Error("ReceivedResource: url = {0}, HttpStatus = {1}, Exception:{2}.", url, we.Status, we.Message);
        _log.Error(we.StackTrace);

        // For protocol errors (4xx/5xx) the exception carries the server response,
        // which holds the connection until it is closed.
        failedResponse = we.Response ?? res;

        UrlInfo urlInfo = new UrlInfo(url, we.Status.ToString());
        _dbm.write_to_db(urlInfo);
        var contentsSaved = ContentsSaved;
        if (contentsSaved != null)
        {
            contentsSaved(we.Status.ToString(), url);
        }

        _reqsBusy[rs.Index] = false;
    }
    catch (Exception e)
    {
        _log.Error("ReceivedResource: url = {0}, Exception:{1}.", url, e.Message);
        _log.Error(e.StackTrace);

        // A response was obtained but reading could not be started.
        failedResponse = res;

        UrlInfo urlInfo = new UrlInfo(url, e.Message);
        _dbm.write_to_db(urlInfo);
        var contentsSaved = ContentsSaved;
        if (contentsSaved != null)
        {
            contentsSaved(e.Message, url);
        }

        _reqsBusy[rs.Index] = false;
    }

    // Release the connection before the slot is handed its next URL.
    if (failedResponse != null)
    {
        try
        {
            failedResponse.Close();
        }
        catch (Exception closeError)
        {
            _log.Error("ReceivedResource: url = {0}, closing response failed:{1}.", url, closeError.Message);
        }
    }

    if (!_reqsBusy[rs.Index])
    {
        RequestResource(rs.Index);
    }
}