// Exports every crawler record belonging to the given task as an .xls download;
// shows a JS alert when the task has no records. isOptimize is forwarded to the
// Excel builder.
public ActionResult Export(TaskEntity model, bool isOptimize)
{
    // All crawl results for this task, newest crawl first.
    var records = new CrawlerDomain()
        .Get(t => t.TaskGuid == model.Unique)
        .OrderByDescending(t => t.爬行更新时间)
        .ToList();

    if (records.Count < 1)
    {
        return Content("<script>alert('未找到内容');</script>");
    }

    var excelBytes = records.ListToExcel(isOptimize);
    var downloadName = "导出北京企业采集信息[" + DateTime.Now.ToString("yyyy-M-d dddd") + "].xls";
    return File(excelBytes, "application/vnd.ms-excel", downloadName);
}
// Searches for a single company by name. If the name already exists in the
// history records the stored result is reused; otherwise a one-off crawl task
// is created and executed (four parallel crawler attempts, first-one-wins)
// before the result is looked up.
// guid: optional task id supplied by the page; searchInfo: company name to find.
// Returns the view bound to the matching CrawlerEntity (null when nothing found
// or when called with no parameters).
public ActionResult SingelSearch(string guid = null, string searchInfo = null)
{
    CrawlerEntity crawlerEntity = null;
    if (string.IsNullOrWhiteSpace(guid) && string.IsNullOrWhiteSpace(searchInfo))
    {
        // First visit with no parameters: hand the view a fresh task id to use
        // on the subsequent search postback.
        ViewBag.Guid = Guid.NewGuid();
    }
    else
    {
        ViewBag.Guid = guid;
        // Nullable count: null when Get(...) itself returns null.
        var count = new TargeCompanyDomain().Get(t => t.CompanyName.Equals(searchInfo))?.Count;
        if (count > 0)
        {
            // Already present in the history records - no crawl needed.
        }
        else
        {
            // Not seen before: build a single-item crawl task and search online.
            List<string> companyList = new List<string> { searchInfo };
            TaskEntity model = new TaskEntity();
            model.TaskType = EnumTaskType.BjCrawler;
            model.TaskName = $"单个任务[{DateTime.Now.ToString("G")}]";
            model.Unique = Conv.ToGuid(guid);
            model.TaskStateDicId = 1;
            model.TaskNum = 1;
            model.CreateTime = DateTime.Now;
            model.IsSingelSearch = true;
            new TaskDomain().Add(model);
            // Seed the task metadata, then fan out four identical crawler tasks.
            // WaitAny blocks only until the first finishes; the other three keep
            // running in the background (presumably redundancy against slow or
            // blocked crawls - TODO confirm).
            new BaseData(model).InsertMetadata(companyList.ToList(), model.TaskName, model, taskEntity =>
            {
                Task[] tasks = new Task[4];
                for (int i = 0; i < 4; i++)
                {
                    tasks[i] = new Task(() =>
                    {
                        var bjqyxy = new Crawler.Bjqyxy.BjCrawler(taskEntity, t => t.TaskGuid.Equals(taskEntity.Unique));
                        bjqyxy.SingelSearch(searchInfo);
                    });
                    tasks[i].Start();
                }
                Task.WaitAny(tasks);
            });
        }
        // Fetch the first record whose searched name matches and whose name
        // field is populated.
        crawlerEntity = new CrawlerDomain().Get(t => t.搜索名称 == searchInfo && t.称 != null).FirstOrDefault();
    }
    return View(crawlerEntity);
}
// Maps a crawler domain to its drawable icon resource id.
// Returns 0 when the domain has no dedicated image.
public static int ToImageResource(this CrawlerDomain domain)
{
    return domain == CrawlerDomain.Mandarake  ? Resource.Drawable.mandarake
         : domain == CrawlerDomain.Surugaya   ? Resource.Drawable.surugaya
         : domain == CrawlerDomain.Mercari    ? Resource.Drawable.mercari
         : domain == CrawlerDomain.Yahoo      ? Resource.Drawable.yahoo
         : domain == CrawlerDomain.Lashinbang ? Resource.Drawable.lashinbang
         : 0;
}
// Service entry point: wires up RabbitMQ workers that consume product-save
// messages, crawl the referenced page(s), parse them and persist the results.
// Each of the workerCount workers runs its JobHandler on its own Task and
// honors the shared cancellation token.
protected override void OnStart(string[] args)
{
    log.Info("Start service");
    try
    {
        InitializeComponent();
        cancelTokenSource = new CancellationTokenSource();
        string rabbitMQServerName = ConfigurationManager.AppSettings["rabbitMQServerName"];
        workers = new Worker[workerCount];
        rabbitMQServer = RabbitMQManager.GetRabbitMQServer(rabbitMQServerName);
        // SECURITY(review): database credentials are hard-coded in source.
        // Move these connection strings into protected configuration.
        string connectToSQL = @"Data Source=172.22.30.86,1455;Initial Catalog=QT_2;Persist Security Info=True;User ID=qt_vn;Password=@F4sJ=l9/ryJt9MT;connection timeout=200";
        string connectToConnection = @"Data Source=42.112.28.93;Initial Catalog=QT_2;Persist Security Info=True;User ID=wss_price;Password=HzlRt4$$axzG-*UlpuL2gYDu;connection timeout=200";
        CrawlerProductAdapter crawlerProductAdapter = new CrawlerProductAdapter(new SqlDb(connectToSQL));
        ProductAdapter productAdapter = new ProductAdapter(new SqlDb(connectToConnection));
        for (int i = 0; i < workerCount; i++)
        {
            // BUG FIX: original used InfoFormat("Start worker {i}", ...) - "{i}"
            // is not a valid composite-format placeholder; "{0}" is.
            log.InfoFormat("Start worker {0}", i);
            var worker = new Worker(AddProductToSqlJobName, false, rabbitMQServer);
            workers[i] = worker;
            var token = this.cancelTokenSource.Token;
            Task workerTask = new Task(() =>
            {
                worker.JobHandler = (downloadImageJob) =>
                {
                    try
                    {
                        token.ThrowIfCancellationRequested();
                        // Decode the queue message and resolve the owning company.
                        QT.Entities.CrawlerProduct.RabbitMQ.MsSaveProduct Mss = QT.Entities.CrawlerProduct.RabbitMQ.MsSaveProduct.GetDataFromMessage(downloadImageJob.Data);
                        string Url = Mss.Url;
                        string Domain = QT.Entities.Common.GetDomainFromUrl(Url);
                        long CompanyID = QT.Entities.Common.GetIDCompany(Domain);
                        QT.Entities.Configuration config = new QT.Entities.Configuration(CompanyID);
                        if (_company.Status == Common.CompanyStatus.WEB_CRAWLERDOMAIN)
                        {
                            // NOTE(review): this branch downloads and parses the page
                            // but never uses the result (ls, obj and doc are dead) -
                            // looks unfinished; confirm intent before removing.
                            List<QT.Entities.Company> ls = new List<QT.Entities.Company>();
                            QT.Entities.CrawlerDomain obj = new CrawlerDomain();
                            string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(Url.Trim(), 15, 1);
                            GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                            // Neutralize <form> tags so the parser treats them as plain divs.
                            html = html.Replace("<form", "<div");
                            html = html.Replace("</form", "</div");
                            doc.LoadHtml(html);
                        }
                        else
                        {
                            int numberItemSaved = 0;
                            // The message may carry several links joined by the standard separator.
                            string[] arLink = Url.Trim().Split(SqlDb.arSplit, StringSplitOptions.RemoveEmptyEntries);
                            foreach (var item in arLink)
                            {
                                QT.Entities.Product _product = new Product();
                                string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(item, 45, 2);
                                if (config.ContentAnanyticXPath.Count >= 1)
                                {
                                    // Trim the page to the configured content window:
                                    // [first marker .. end of second marker], when present.
                                    int i1 = 0, i2 = 0;
                                    i1 = html.IndexOf(config.ContentAnanyticXPath[0]);
                                    if (i1 >= 0)
                                    {
                                        html = html.Substring(i1);
                                        if (config.ContentAnanyticXPath.Count >= 2)
                                        {
                                            i2 = html.IndexOf(config.ContentAnanyticXPath[1]);
                                            if (i2 >= 0)
                                            {
                                                html = html.Substring(0, i2 + config.ContentAnanyticXPath[1].Length);
                                            }
                                        }
                                    }
                                    html = html.Replace("<form", "<div");
                                    html = html.Replace("</form", "</div");
                                    html = Common.TidyCleanR(html);
                                }
                                // Keep the raw source around for diagnostics.
                                _htmlSource = html;
                                GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                                html = html.Replace("<form", "<div");
                                html = html.Replace("</form", "</div");
                                doc.LoadHtml(html);
                                List<Product> lstUpdateProduct = new List<Product>();
                                List<Product> lstInsertProduct = new List<Product>();
                                // Extract product fields from the document in place.
                                _product.Analytics(doc, item, config, true, _company.Domain);
                                if (_product != null && _product.IsSuccessData(config.CheckPrice))
                                {
                                    numberItemSaved++;
                                    // Route to update or insert depending on existence in DB.
                                    if (productAdapter.CheckExistInDb(_product.ID))
                                    {
                                        lstUpdateProduct.Add(_product);
                                    }
                                    else
                                    {
                                        lstInsertProduct.Add(_product);
                                    }
                                    productAdapter.UpdateProductsChangeToDb(lstUpdateProduct);
                                    productAdapter.InsertListProduct(lstInsertProduct);
                                    // Notify downstream consumers (index + image pipelines).
                                    productAdapter.PushQueueIndexCompany(config.CompanyID);
                                    productAdapter.PushQueueChangeChangeImage(new MQChangeImage() { ProductID = _product.ID, Type = 1 });
                                    log.InfoFormat("Saved {0} item product!", _product.Name);
                                }
                            }
                        }
                        return true;
                    }
                    catch (OperationCanceledException)
                    {
                        // Cooperative shutdown: report the job as not handled.
                        log.Info("End worker");
                        return false;
                    }
                };
                worker.Start();
            }, token);
            workerTask.Start();
            log.InfoFormat("Worker {0} started", i);
        }
    }
    catch (Exception ex)
    {
        log.Error("Start error", ex);
        throw;
    }
}
// Resolves the crawler registered for the given domain.
// Throws KeyNotFoundException when no crawler is registered for it.
public ICrawler GetCrawler(CrawlerDomain crawlerDomain) => _crawlers[crawlerDomain];