/// <summary>
/// Processes the jobs in <c>_linksQueue</c> one at a time via <c>ProcessJob</c>, reloading the
/// queue with <c>LoadQueue()</c> whenever it runs dry, and logging one status line per job.
/// </summary>
/// <remarks>
/// The loop stops, recording the reason in <c>_typeEnd</c>, when:
/// the queue is still empty after a reload (<c>TypeEnd.Success</c>),
/// more than <c>MaxHourReload</c> hours have elapsed since <c>_timeStart</c> (<c>TypeEnd.OverTime</c>),
/// an <see cref="OperationCanceledException"/> is raised (<c>TypeEnd.Immediate</c>),
/// or any other exception is raised (<c>TypeEnd.Error</c>, logged and swallowed).
/// <c>End()</c> is called on each of these paths.
/// </remarks>
/// <exception cref="OperationCanceledException">
/// Rethrown after the company id has been published to <c>_producerPushCompanyReload</c>
/// and <c>End()</c> has run.
/// </exception>
private void Crawl()
{
    try
    {
        LoadQueue();
        while (true)
        {
            Token.ThrowIfCancellationRequested();

            if (_linksQueue.Count == 0)
            {
                _typeEnd = TypeEnd.Success;
                break;
            }

            if ((DateTime.Now - _timeStart).TotalHours > MaxHourReload)
            {
                _typeEnd = TypeEnd.OverTime;
                break;
            }

            var job = _linksQueue.Dequeue();
            int statusProcess = ProcessJob(job);

            // The log line already tolerates a null _company for Domain; read TotalProduct
            // the same way so that building a log message can never abort the crawl.
            // string.Format renders a null argument as an empty string.
            object totalProduct = null;
            if (_company != null)
            {
                totalProduct = _company.TotalProduct;
            }

            string strLog = string.Format(
                "ss: {0} cQ: {1} tP: {2} cV: {3} pt: {4} {5} sst: {6} cmp: {7} {8}",
                _session,
                _linksQueue.Count,
                totalProduct,
                _countVisited,
                job.ProductId,
                job.url,
                statusProcess,
                _companyId,
                (this._company == null) ? "" : this._company.Domain);

            if (EventReportRun != null)
            {
                EventReportRun(strLog);
            }
            _log.Info(strLog);

            // Refill eagerly so the next iteration only reports Success
            // when the reload produced nothing.
            if (_linksQueue.Count == 0)
            {
                LoadQueue();
            }
        }
    }
    catch (OperationCanceledException)
    {
        _typeEnd = TypeEnd.Immediate;
        _producerPushCompanyReload.PublishString(_companyId.ToString(), true, 0);
        End();
        throw;
    }
    catch (Exception ex)
    {
        _typeEnd = TypeEnd.Error;
        _log.Error(ex);
    }

    End();
}
/// <summary>
/// Decides whether the crawl loop should stop and, if so, records the reason in <c>_typeEnd</c>.
/// </summary>
/// <returns>
/// <c>true</c> when cancellation was requested on <c>_tokenCrawler</c> (<c>TypeEnd.Immediate</c>),
/// when <c>_linkQueue</c> is empty (<c>TypeEnd.Success</c>), or when the whole hours elapsed since
/// <c>_timeStart</c> have reached <c>_config.MaxHourFindNew</c> (<c>TypeEnd.OverTime</c>);
/// otherwise <c>false</c>, leaving <c>_typeEnd</c> untouched.
/// </returns>
private bool CheckEnd()
{
    if (_tokenCrawler.IsCancellationRequested)
    {
        _typeEnd = TypeEnd.Immediate;
        return true;
    }

    if (_linkQueue.Count == 0)
    {
        _typeEnd = TypeEnd.Success;
        return true;
    }

    // TimeSpan.Hours is only the 0-23 hours *component* and wraps around every day, so a
    // limit of 24 or more could never be reached. Truncated TotalHours gives the same value
    // for runs shorter than a day and keeps counting beyond it.
    int elapsedHours = (int)(DateTime.Now - _timeStart).TotalHours;
    if (elapsedHours >= _config.MaxHourFindNew)
    {
        _typeEnd = TypeEnd.OverTime;
        return true;
    }

    return false;
}
/// <summary>
/// Runs one crawl session: after <c>UpdateLastStart()</c> and a successful <c>Init()</c>, seeds the
/// queue with <c>AddRootQueue()</c> and then dequeues jobs from <c>_linkQueue</c> until
/// <c>CheckEnd()</c> reports that the session is over, finishing with <c>End()</c>.
/// </summary>
/// <remarks>
/// For every dequeued job a status line is logged and reported through <c>EventReportRun</c>.
/// The URL is visited only when <c>IsNoVisitUrl</c> returns <c>false</c> for it and the
/// product count (<c>_crcProductOldGroup.Count + _countNewProduct</c>) is still below
/// <c>_limitProductValid</c>; in that case a <c>VisitedLinkFindNew</c> record is published, the
/// page is fetched with <c>GetHtmlCode</c> and non-empty HTML is passed to <c>ProcessLink</c>.
/// When <c>Init()</c> returns <c>false</c> nothing further happens.
/// Exceptions other than <see cref="OperationCanceledException"/> are logged, published as an
/// <c>ErrorCrawler</c> report and swallowed; <c>End()</c> is not called on that path.
/// </remarks>
/// <exception cref="OperationCanceledException">
/// Rethrown after <c>_typeEnd</c> is set to <c>TypeEnd.Immediate</c> and <c>End()</c> has run.
/// </exception>
public void StartCrawler()
{
    try
    {
        UpdateLastStart();
        if (Init())
        {
            RunReportRunning();
            AddRootQueue();
            _log.Info(GetPrefixLog());

            while (!CheckEnd())
            {
                var jobCrawl = _linkQueue.Dequeue();

                // Plain concatenation on purpose: the finished message contains the URL and must
                // not be parsed again as a composite format string, where a '{' or '}' in the
                // URL would throw FormatException and abort the whole crawl.
                string strLog = GetPrefixLog()
                    + string.Format(" Url: {0} Deep: {1}", jobCrawl.Url, jobCrawl.Deep);
                _log.Info(strLog);
                if (EventReportRun != null)
                {
                    EventReportRun(strLog);
                }

                DelayCrawler();

                bool underProductLimit =
                    _crcProductOldGroup.Count + _countNewProduct < _limitProductValid;
                if (!IsNoVisitUrl(jobCrawl.Url) && underProductLimit)
                {
                    _countVisited++;
                    _producerVisitedLinkFindNew.PublishString(
                        Newtonsoft.Json.JsonConvert.SerializeObject(new VisitedLinkFindNew()
                        {
                            CompanyId = _companyId,
                            ProductId = jobCrawl.Id,
                            Url = jobCrawl.Url,
                            Session = _session,
                            LastUpdate = DateTime.Now
                        }),
                        false,
                        300);

                    var html = GetHtmlCode(jobCrawl.Url, _config.UseClearHtml);

                    // Treat a null result like an empty page instead of handing null to ProcessLink.
                    if (!string.IsNullOrEmpty(html))
                    {
                        ProcessLink(jobCrawl, html);
                    }
                }
            }

            End();
        }
    }
    catch (OperationCanceledException)
    {
        _typeEnd = TypeEnd.Immediate;
        _log.Info("Push job back queue");
        End();
        throw;
    }
    catch (Exception ex01)
    {
        _log.Error(ex01);
        _producerReportError.PublishString(
            Newtonsoft.Json.JsonConvert.SerializeObject(new ErrorCrawler()
            {
                CompanyId = _companyId,
                ProductId = 0,
                TimeError = DateTime.Now,
                Message = ex01.Message + "\n" + ex01.StackTrace,
                Url = ""
            }),
            true,
            0);
    }
}