/// <summary>
/// Wires this pipeline to the follow-up spider: extracted entities named
/// <paramref name="entityName"/> are turned into start urls for it.
/// </summary>
public LinkSpiderPipeline(string entityName, IScheduler nextSpiderScheduler, ISpider nextSpider, LinkSpiderPrepareStartUrls prepareStartUrls)
{
    _entityName = entityName;
    _prepareStartUrls = prepareStartUrls;
    NextSpider = nextSpider;
    NextSpiderScheduler = nextSpiderScheduler;
}
/// <summary>
/// Dumps every extracted entity to the console, one JSON object per call.
/// </summary>
/// <param name="datas">Entities extracted from one page.</param>
/// <param name="spider">The running spider (unused).</param>
public void Process(List<JObject> datas, ISpider spider)
{
    datas.ForEach(data => Console.WriteLine(data.ToString()));
}
/// <summary>
/// Appends the extracted entities to the shared in-memory collector.
/// </summary>
/// <param name="datas">Entities extracted from one page.</param>
/// <param name="spider">The running spider (unused).</param>
public void Process(List<JObject> datas, ISpider spider)
{
    // NOTE(review): lock(this) exposes the lock to external callers and can
    // deadlock; a private readonly lock object would be safer, but adding a
    // field is outside the scope of this method.
    lock (this)
    {
        _collector.AddRange(datas);
    }
}
/// <summary>
/// Prints each result entry to the console as "key:&lt;tab&gt;value".
/// </summary>
public void Process(ResultItems resultItems, ISpider spider)
{
    foreach (var entry in resultItems.Results)
    {
        var line = entry.Key + ":\t" + entry.Value;
        System.Console.WriteLine(line);
    }
}
/// <summary>
/// Flattens the named result item of every ResultItems into a single list of
/// JObjects and forwards the list to the List-based Process overload.
/// </summary>
/// <param name="resultItemsList">Result sets collected by the spider.</param>
/// <param name="spider">The running spider.</param>
protected override void Process(List<ResultItems> resultItemsList, ISpider spider)
{
    if (resultItemsList == null || resultItemsList.Count == 0)
    {
        return;
    }
    List<JObject> list = new List<JObject>();
    foreach (var resultItems in resultItemsList)
    {
        // dynamic: the stored item may be a single JObject or a collection
        dynamic data = resultItems.GetResultItem(_entityName);
        if (data != null)
        {
            if (data is JObject)
            {
                list.Add(data);
            }
            else
            {
                // assumes any non-JObject value is enumerable of JObject — TODO confirm
                list.AddRange(data);
            }
        }
    }
    Process(list, spider);
}
/// <summary>
/// Test setup: builds a ResultItems holding one "content" entry bound to a
/// baidu.com request, plus a default spider instance.
/// </summary>
public void Before()
{
    var request = new Request("http://www.baidu.com", 1, null);
    _resultItems = new ResultItems { Request = request };
    _resultItems.AddOrUpdateResultItem("content", "爬虫工具");
    _spider = new DefaultSpider();
}
/// <summary>
/// Aggregates the collaborators Zap drives: settings, logging, scanning,
/// reporting and spidering.
/// </summary>
public Zap(ISettings settings, ILogger logger, IScanner scanner, IReporting reporting, ISpider spider)
{
    this.spider = spider;
    this.reporting = reporting;
    this.scanner = scanner;
    this.logger = logger;
    this.settings = settings;
}
/// <summary>
/// Builds start requests for the follow-up spider from the extracted data and
/// pushes every one of them onto the next spider's scheduler.
/// </summary>
private void Process(List<JObject> datas, ISpider spider)
{
    _prepareStartUrls.Build(spider.Site, datas);
    foreach (var request in spider.Site.StartRequests)
    {
        NextSpiderScheduler.Push(request);
    }
}
/// <summary>
/// "Downloads" a page by reading it from the local file the request's url
/// points at; always reports HTTP 200.
/// </summary>
public Page Download(Request request, ISpider spider)
{
    var url = request.Url.ToString();
    var page = new Page(request, spider.Site.ContentType)
    {
        Content = File.ReadAllText(request.Url.LocalPath),
        TargetUrl = url,
        Url = url,
        StatusCode = 200
    };
    return page;
}
/// <summary>
/// Writes each result entry as "key:&lt;tab&gt;value" to the platform's output sink.
/// </summary>
public void Process(ResultItems resultItems, ISpider spider)
{
    foreach (var entry in resultItems.Results)
    {
#if NET_CORE
        // .NET Core build logs through the framework's Log helper
        Log.WriteLine(entry.Key + ":\t" + entry.Value);
#else
        System.Console.WriteLine(entry.Key + ":\t" + entry.Value);
#endif
    }
}
/// <summary>
/// Writes each extracted entity's JSON text to the platform's output sink.
/// </summary>
public void Process(List<JObject> datas, ISpider spider)
{
    foreach (var data in datas)
    {
#if NET_CORE
        // .NET Core build logs through the framework's Log helper
        Log.WriteLine(data.ToString());
#else
        Console.WriteLine(data.ToString());
#endif
    }
}
/// <summary>
/// Returns the path "data/{spider.Identity}/{name}.sql" under the application
/// base directory, creating the folder when it does not yet exist.
/// </summary>
public static string GetDataFilePath(ISpider spider, string name)
{
#if !NET_CORE
    string folderPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "data", spider.Identity);
#else
    // AppDomain is unavailable on early .NET Core; AppContext supplies the base path
    string folderPath = Path.Combine(AppContext.BaseDirectory, "data", spider.Identity);
#endif
    if (!Directory.Exists(folderPath))
    {
        Directory.CreateDirectory(folderPath);
    }
    return Path.Combine(folderPath, name + ".sql");
}
/// <summary>
/// Wraps each entity into a BsonDocument (task id, shared batch timestamp,
/// raw JSON text) and bulk-inserts the batch into the Mongo collection.
/// </summary>
public void Process(List<JObject> datas, ISpider spider)
{
    var now = DateTime.Now;
    var documents = new List<BsonDocument>(datas.Count);
    foreach (var data in datas)
    {
        documents.Add(new BsonDocument
        {
            { "TaskId", _id },
            { "Timestamp", now },
            { "Data", data.ToString() }
        });
    }
    _collection.InsertMany(documents);
}
/// <summary>
/// Copies the "AtzucheList" result of each result set into a fresh model list
/// and echoes each car number / type to the console.
/// </summary>
/// <param name="resultItems">Extraction results.</param>
/// <param name="spider">The running spider (unused).</param>
public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
{
    var result = new List<AtzucheModel>();
    foreach (var resultItem in resultItems)
    {
        // FIX: the original repeated an unchecked "as" cast, which throws a
        // NullReferenceException when the stored value is not the expected
        // list type; guard the cast once instead.
        if (resultItem.Results["AtzucheList"] is List<AtzucheModel> models)
        {
            Console.WriteLine(models.Count);
            foreach (var item in models)
            {
                result.Add(new AtzucheModel() { carNo = item.carNo });
                Console.WriteLine($"{item.carNo}:{item.type} ");
            }
        }
    }
}
/// <summary>
/// Serializes the result entries to JSON in
/// "{BasePath}/{identity}/{md5(url)}.json"; IO errors are logged and rethrown.
/// </summary>
public void Process(ResultItems resultItems, ISpider spider)
{
    var fileName = Encrypt.Md5Encrypt(resultItems.Request.Url.ToString());
    string path = $"{BasePath}{PathSeperator}{spider.Identity}{PathSeperator}{fileName}.json";
    try
    {
        FileInfo file = PrepareFile(path);
        using (var writer = new StreamWriter(file.OpenWrite(), Encoding.UTF8))
        {
            writer.WriteLine(JsonConvert.SerializeObject(resultItems.Results));
        }
    }
    catch (IOException e)
    {
        spider.Logger.Warn("write file error", e);
        throw;
    }
}
/// <summary>
/// Appends one batch of entities to an in-memory EPPlus worksheet keyed by
/// "{excelPath}.{sheetName}"; creates the package, sheet and header row on
/// first use.
/// </summary>
private void WriteToExcel(IModel model, IEnumerable<dynamic> datas, ISpider spider)
{
    var excelPath = Path.Combine(Env.BaseDirectory, "excels", $"{spider.Name}_{spider.Identity}.xlsx");
    var sheetName = model.TableInfo.Name;
    var sheetIndex = $"{excelPath}.{sheetName}";
    if (!_packages.ContainsKey(excelPath))
    {
        _packages.Add(excelPath, new ExcelPackage());
    }
    if (!_rowRecords.ContainsKey(sheetIndex))
    {
        // Excel rows are 1-based, so the per-sheet row counter starts at 1
        _rowRecords.Add(sheetIndex, 1);
    }
    var p = _packages[excelPath];
    var sheet = p.Workbook.Worksheets[sheetName];
    int row = 1;
    var columns = model.Fields.ToList();
    if (sheet == null)
    {
        // first batch for this sheet: write the lower-cased header row first
        sheet = p.Workbook.Worksheets.Add(sheetName);
        for (int i = 1; i < columns.Count + 1; ++i)
        {
            sheet.Cells[1, i].Value = columns[i - 1].Name.ToLower();
        }
        row = IncreaseRowIndex(sheetIndex);
    }
    // NOTE(review): when the sheet already exists, row restarts at 1 instead
    // of resuming from _rowRecords[sheetIndex]; later batches appear to
    // overwrite earlier rows — verify IncreaseRowIndex semantics.
    foreach (var data in datas)
    {
        for (int j = 1; j < columns.Count + 1; ++j)
        {
            var column = columns[j - 1].Name;
            sheet.Cells[row, j].Value = data[column];
        }
        row = IncreaseRowIndex(sheetIndex);
    }
}
/// <summary>
/// Concurrent method: each spider runs inside its own thread; blocks until
/// every thread has finished, then merges all extracted data.
/// </summary>
/// <typeparam name="T">Type of the items the spiders extract.</typeparam>
/// <param name="classTypes">Assembly-qualified spider type names to instantiate.</param>
/// <returns>All items extracted by every spider that was started.</returns>
public IList<T> StartConcurrent<T>(List<string> classTypes)
{
    List<T> data = new List<T>();
    // bail out when already disposed or cancellation was requested
    if (disposedValue || CancellationToken.IsCancellationRequested)
    {
        return (data);
    }
    IList<ISpider> spiders = new List<ISpider>();
    foreach (string classType in classTypes)
    {
        Type? type = Type.GetType(classType);
        if (type == null)
        {
            // unknown type name: skipped silently
            continue;
        }
        ISpider<T> spider = _spiderFactory.GetSpider<T>(type, CancellationToken);
        //add the new spider
        _spiders.Add(spider);
        spiders.Add(spider);
        Thread thread = new Thread(new ThreadStart(spider.Go));
        _threads.Add(thread);
        thread.Start();
    }
    // NOTE(review): this joins every thread in _threads, including any added
    // by earlier or concurrent calls — confirm that is intended.
    foreach (Thread thread in _threads)
    {
        thread.Join();
    }
    foreach (ISpider<T> spider in spiders)
    {
        _spiders.Remove(spider);
    }
    foreach (ISpider<T> spider in spiders)
    {
        data.AddRange(spider.ExtractData);
        spider.Dispose();
    }
    _logger.LogDebug("Spider finished.");
    return (data);
}
/// <summary>
/// Pops the next request, preferring the plus-priority queue, then the
/// no-priority queue, then the minus-priority queue.
/// </summary>
public override Request Poll(ISpider spider)
{
    // NOTE(review): lock(this) exposes the lock to callers; a private lock
    // object would be safer but needs a field outside this method.
    lock (this)
    {
        Request poll = _priorityQueuePlus.Pop();
        if (poll != null)
        {
            return (poll);
        }
        // assumes _noPriorityQueue.Dequeue() returns null when empty rather
        // than throwing — TODO confirm the queue implementation
        poll = _noPriorityQueue.Dequeue();
        if (poll != null)
        {
            return (poll);
        }
        return (_priorityQueueMinus.Pop());
    }
}
/// <summary>
/// Persists each result set into its own "{guid}.dsd" file inside the
/// spider's data folder, and accumulates result / effected-row counts on the
/// originating request.
/// </summary>
/// <param name="resultItems">Extraction results.</param>
/// <param name="spider">The running spider.</param>
public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
{
    try
    {
        foreach (var resultItem in resultItems)
        {
            // reset counters before recounting this result set
            resultItem.Request.CountOfResults = 0;
            resultItem.Request.EffectedRows = 0;
            string filePath = Path.Combine(GetDataFolder(spider), $"{ Guid.NewGuid():N}.dsd");
            using (StreamWriter printWriter = new StreamWriter(File.OpenWrite(filePath), Encoding.UTF8))
            {
                printWriter.WriteLine("url:\t" + resultItem.Request.Url);
                foreach (var entry in resultItem.Results)
                {
                    if (entry.Value is IList)
                    {
                        IList value = entry.Value;
                        IList list = value;
                        // list values: key header, then one line per element
                        printWriter.WriteLine(entry.Key + ":");
                        foreach (var o in list)
                        {
                            printWriter.WriteLine(o);
                        }
                        resultItem.Request.CountOfResults += list.Count;
                        resultItem.Request.EffectedRows += list.Count;
                    }
                    else
                    {
                        // scalar values: single "key:<tab>value" line
                        printWriter.WriteLine(entry.Key + ":\t" + entry.Value);
                        resultItem.Request.CountOfResults += 1;
                        resultItem.Request.EffectedRows += 1;
                    }
                }
            }
        }
    }
    catch
    {
        // rethrown so the caller decides how to react to the failed write
        spider.Logger.Error("Write file error.");
        throw;
    }
}
/// <summary>
/// Injects cookies into the spider's site, optionally pausing the spider
/// while the cookies are swapped in and resuming it afterwards.
/// </summary>
public virtual void Inject(ISpider spider, bool stopSpider = true)
{
    void Apply()
    {
        spider.Site.Cookies = GetCookies(spider);
        Logger.MyLog(spider.Identity, "注入 Cookies 成功.", LogLevel.Info);
    }
    if (stopSpider)
    {
        spider.Pause(() =>
        {
            Apply();
            spider.Contiune();
        });
    }
    else
    {
        Apply();
    }
}
/// <summary>
/// Reads the page content from the local file named by the request uri;
/// returns null when the path is empty or the file does not exist.
/// </summary>
protected override Page DowloadContent(Request request, ISpider spider)
{
    var filePath = request.Uri.AbsoluteUri;
    if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath))
    {
        return null;
    }
    return new Page(request) { Content = File.ReadAllText(filePath) };
}
/// <summary>
/// When the downloaded content contains the configured marker string,
/// redials (exiting the spider when that fails), marks the request for a
/// retry cycle and records a DownloadException on the page.
/// </summary>
/// <param name="page">Page data; replaced with a cycle-retry page on match.</param>
/// <param name="spider">The running spider.</param>
public override void Handle(ref Page page, ISpider spider)
{
    // FIX: the original re-tested "page != null" and "page.Content not empty"
    // immediately after this guard already established both — dead conditions.
    if (page == null || string.IsNullOrEmpty(Content) || string.IsNullOrEmpty(page.Content))
    {
        return;
    }
    if (page.Content.Contains(Content))
    {
        if (NetworkCenter.Current.Executor.Redial() == RedialResult.Failed)
        {
            Logger.MyLog(spider.Identity, "Exit program because redial failed.", LogLevel.Error);
            spider.Exit();
        }
        page = Spider.AddToCycleRetry(page.Request, spider.Site);
        page.Exception = new DownloadException($"Content downloaded contains string: {Content}.");
    }
}
/// <summary>
/// Reads page content from the file recorded in the request's "__FilePath"
/// extra; yields a completed task carrying null when the path is missing or
/// the file is absent.
/// </summary>
protected override Task<Page> DowloadContent(Request request, ISpider spider)
{
    var filePath = request.GetExtra("__FilePath");
    if (!string.IsNullOrEmpty(filePath))
    {
        if (File.Exists(filePath))
        {
            return Task.FromResult(new Page(request) { Content = File.ReadAllText(filePath) });
        }
    }
    // BUG FIX: the original returned a null Task, which makes any awaiting
    // caller throw a NullReferenceException; return a completed task whose
    // result is null instead.
    return Task.FromResult<Page>(null);
}
/// <summary>
/// Returns (and caches) the per-spider data folder under RootDataFolder,
/// creating the directory on first use.
/// </summary>
internal string GetDataFolder(ISpider spider)
{
    // FIX: TryGetValue replaces the ContainsKey + indexer double lookup
    if (_dataFolderCache.TryGetValue(spider.Identity, out var cached))
    {
        return cached;
    }
    var dataFolder = Path.Combine(RootDataFolder, spider.Identity);
    if (!Directory.Exists(dataFolder))
    {
        Directory.CreateDirectory(dataFolder);
    }
    _dataFolderCache.TryAdd(spider.Identity, dataFolder);
    return dataFolder;
}
/// <summary>
/// Searches the current content for all occurrences of the configured regular
/// expression (with the configured options) and replaces the page content
/// with the concatenated match values.
/// </summary>
/// <param name="page">Page data.</param>
/// <param name="downloader">Downloader (unused).</param>
/// <param name="spider">The running spider (unused).</param>
public override void Handle(ref Page page, IDownloader downloader, ISpider spider)
{
    if (page == null || string.IsNullOrWhiteSpace(page.Content))
    {
        return;
    }
    var pieces = string.Empty;
    foreach (Match match in Regex.Matches(page.Content, _pattern, _regexOptions))
    {
        pieces += match.Value;
    }
    page.Content = pieces;
}
/// <summary>
/// Loads cookies from "{identity}.cookies" in the working directory; returns
/// an empty cookie set when the file is absent.
/// </summary>
protected override Cookies GetCookies(ISpider spider)
{
    var path = $"{spider.Identity}.cookies";
    if (!File.Exists(path))
    {
        return new Cookies();
    }
    return new Cookies { StringPart = File.ReadAllText(path) };
}
/// <summary>
/// Serializes each result set to one JSON line in "{identity}.json" inside
/// the spider's data folder; IO errors are logged and rethrown.
/// </summary>
/// <param name="resultItems">Extraction results.</param>
/// <param name="spider">The running spider.</param>
public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
{
    try
    {
        var jsonFile = Path.Combine(GetDataFolder(spider), $"{spider.Identity}.json");
        var writer = GetDataWriter(jsonFile);
        foreach (var item in resultItems)
        {
            writer.WriteLine(JsonConvert.SerializeObject(item.Results));
        }
    }
    catch (IOException e)
    {
        Logger.Log(spider.Identity, "Write data to json file failed.", Level.Error, e);
        throw;
    }
}
/// <summary>
/// Template download flow: before-hook, content download, after-hook, then
/// content-type detection. Returns null when the spider has no site.
/// </summary>
public Page Download(Request request, ISpider spider)
{
    if (spider.Site == null)
    {
        return null;
    }
    HandleBeforeDownload(ref request, spider);
    var page = DowloadContent(request, spider);
    HandlerAfterDownloadComplete(ref page, spider);
    TryDetectContentType(page, spider);
    return page;
}
/// <summary>
/// Replaces the page content with the concatenation of every match of the
/// configured pattern (multiline, case-insensitive). Always returns true.
/// </summary>
public override bool Handle(ref Page page, ISpider spider)
{
    // BUG FIX: the original guard was inverted — it returned early when
    // content WAS present (so the regex never ran on real content) and fell
    // through when page was null, dereferencing page.Content and throwing.
    if (page == null || string.IsNullOrEmpty(page.Content))
    {
        return (true);
    }
    string textValue = string.Empty;
    MatchCollection collection = Regex.Matches(page.Content, Pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
    foreach (Match item in collection)
    {
        textValue += item.Value;
    }
    page.Content = textValue;
    return (true);
}
/// <summary>
/// Replaces the page content with the concatenation of all matches of the
/// configured pattern (multiline, case-insensitive).
/// </summary>
public override void Handle(ref Page page, IDownloader downloader, ISpider spider)
{
    if (string.IsNullOrEmpty(page?.Content))
    {
        return;
    }
    var matched = string.Empty;
    foreach (Match m in Regex.Matches(page.Content, _pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase))
    {
        matched += m.Value;
    }
    page.Content = matched;
}
/// <summary>
/// Performs the cookie injection for a spider.
/// </summary>
/// <param name="spider">Spider that receives the cookies.</param>
/// <param name="pauseBeforeInject">Whether to pause the spider before injecting.</param>
public virtual void Inject(ISpider spider, bool pauseBeforeInject = true)
{
    void InjectCookies()
    {
        spider.Site.Cookies = GetCookies(spider);
        Logger.AllLog(spider.Identity, "Inject cookies success.", LogLevel.Info);
    }
    if (pauseBeforeInject)
    {
        spider.Pause(() =>
        {
            InjectCookies();
            spider.Contiune();
        });
    }
    else
    {
        InjectCookies();
    }
}
/// <summary>
/// Queues the generated requests on the page unless the configured stopper
/// says to stop; always returns true.
/// </summary>
public virtual bool Handle(Page page, ISpider spider)
{
    // FIX: the original duplicated the request-generation body in both the
    // if and else branches; collapsed into the single equivalent condition
    // "generate unless a stopper exists AND says stop".
    if (Stopper == null || !Stopper.NeedStop(page, this))
    {
        page.AddTargetRequests(GenerateRequests(page));
        page.MissExtractTargetUrls = true;
    }
    return (true);
}
/// <summary>
/// Runs the configured target-urls extractor and queues every extracted
/// request on the page.
/// </summary>
/// <param name="page">Page data.</param>
/// <param name="spider">The running spider.</param>
public override void Handle(ref Page page, ISpider spider)
{
    if (_targetUrlsExtractor == null)
    {
        return;
    }
    var requests = _targetUrlsExtractor.ExtractRequests(page, spider.Site);
    foreach (var request in requests)
    {
        page.AddTargetRequest(request);
    }
    if (!_extractByProcessor)
    {
        // FIX: the original expression "!x || x" is a tautology that always
        // evaluates to true — state the intent directly.
        page.SkipExtractTargetUrls = true;
    }
}
/// <summary>
/// Looks up the adapter for the entity and executes the SQL statement that
/// matches its pipeline mode (plain insert is the fallback).
/// </summary>
/// <returns>Number of rows the SQL execution affected; 0 when no adapter exists.</returns>
public override int Process(string entityName, IEnumerable<dynamic> datas, ISpider spider)
{
    if (!EntityAdapters.TryGetValue(entityName, out var metadata))
    {
        return 0;
    }
    string sql;
    if (metadata.PipelineMode == PipelineMode.InsertAndIgnoreDuplicate)
    {
        sql = metadata.InsertAndIgnoreDuplicateSql;
    }
    else if (metadata.PipelineMode == PipelineMode.InsertNewAndUpdateOld)
    {
        sql = metadata.InsertNewAndUpdateOldSql;
    }
    else if (metadata.PipelineMode == PipelineMode.Update)
    {
        sql = metadata.UpdateSql;
    }
    else
    {
        // PipelineMode.Insert and any unrecognised mode use the plain insert
        sql = metadata.InsertSql;
    }
    return ExecuteHttpSql(sql, datas);
}
/// <summary>
/// Maps each result set's url/title/html items to a DefaulHtmlContent row and
/// bulk-inserts the rows with INSERT IGNORE.
/// </summary>
public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
{
    var rows = new List<DefaulHtmlContent>();
    foreach (var item in resultItems)
    {
        rows.Add(new DefaulHtmlContent
        {
            Url = item.GetResultItem("url")?.ToString(),
            Title = item.GetResultItem("title")?.ToString(),
            Html = item.GetResultItem("html")?.ToString()
        });
    }
    using (var conn = ConnectionStringSettings.CreateDbConnection())
    {
        conn.MyExecute($"INSERT IGNORE `{Database}`.`{TableName}` (`url`, `title`, `html`) VALUES (@Url, @Title, @Html);", rows);
    }
}
/// <summary>
/// Stores the extracted entities in the in-memory collector keyed by the
/// model's identity.
/// </summary>
/// <param name="model">Model whose identity keys the collector bucket.</param>
/// <param name="datas">Extracted entities.</param>
/// <param name="spider">The running spider (unused).</param>
/// <returns>Number of entities stored.</returns>
protected override int Process(IModel model, IEnumerable<dynamic> datas, ISpider spider)
{
    lock (_locker)
    {
        // FIX: materialize once — the original enumerated the IEnumerable
        // twice (AddRange then Count()), which re-runs lazy sources.
        var items = datas.ToList();
        if (_collector.TryGetValue(model.Identity, out var list))
        {
            list.AddRange(items);
        }
        else
        {
            _collector.Add(model.Identity, new List<dynamic>(items));
        }
        return items.Count;
    }
}
/// <summary>
/// Counts handled pages and triggers an ADSL redial every RedialLimit pages;
/// a RedialLimit of 0 disables the behaviour entirely.
/// </summary>
public override bool Handle(Page page, ISpider spider)
{
    if (RedialLimit != 0)
    {
        // NOTE(review): lock(this) is visible to external callers; a private
        // lock object would be safer but needs a field outside this method.
        lock (this)
        {
            ++RequestedCount;
            if (RedialLimit > 0 && RequestedCount == RedialLimit)
            {
                // reset the counter so the next window starts from zero
                RequestedCount = 0;
                ((IRedialExecutor)NetworkCenter.Current.Executor).Redial();
            }
        }
    }
    return (true);
}
/// <summary>
/// Derives the Redis key names from the md5 of the spider's identity and
/// registers the task in the global sorted task list.
/// </summary>
public override void Init(ISpider spider)
{
    base.Init(spider);
    var md5 = Encrypt.Md5Encrypt(spider.Identity);
    ItemKey += md5;
    SetKey += md5;
    // NOTE(review): QueueKey is overwritten (=) while the sibling keys are
    // appended to (+=) — verify this asymmetry is intentional.
    QueueKey = md5;
    ErrorCountKey += md5;
    SuccessCountKey += md5;
    IdentityMd5 = md5;
    NetworkProxyManager.Current.Execute("rds-in", () =>
    {
        Db.SortedSetAdd(TaskList, spider.Identity, (long)DateTimeUtils.GetCurrentTimeStamp());
    });
}
/// <summary>
/// When the downloaded content contains any of the configured marker strings,
/// redials (exiting the spider when the redial fails), marks the request for
/// a retry cycle and records a DownloadException on the page.
/// </summary>
public override void Handle(ref Page page, ISpider spider)
{
    if (string.IsNullOrEmpty(page?.Content))
    {
        return;
    }
    var content = page.Content;
    var hit = _contents.FirstOrDefault(c => content.Contains(c));
    if (hit == null)
    {
        return;
    }
    if (NetworkCenter.Current.Executor.Redial() == RedialResult.Failed)
    {
        Logger.MyLog(spider.Identity, "Exit program because redial failed.", LogLevel.Error);
        spider.Exit();
    }
    page = Spider.AddToCycleRetry(page.Request, spider.Site);
    page.Exception = new DownloadException($"Content downloaded contains string: {hit}.");
}
/// <summary>
/// Writes "url" and "html" result lines to
/// "{BasePath}/{identity}/{md5(url)}.html"; IO failures are logged, not rethrown.
/// </summary>
public void Process(ResultItems resultItems, ISpider spider)
{
    var folder = BasePath + PathSeperator + spider.Identity + PathSeperator;
    var fileName = Encrypt.Md5Encrypt(resultItems.Request.Url.ToString()) + ".html";
    try
    {
        FileInfo fileInfo = PrepareFile(folder + fileName);
        using (var writer = new StreamWriter(fileInfo.OpenWrite(), Encoding.UTF8))
        {
            writer.WriteLine("url:\t" + resultItems.Request.Url);
            writer.WriteLine("html:\t" + resultItems.GetResultItem("html"));
        }
    }
    catch (IOException e)
    {
        // best-effort persistence: log and continue
        LogUtils.GetLogger(spider).Warn("write file error", e);
    }
}
/// <summary>
/// Serializes the result entries to JSON in
/// "{BasePath}/{identity}/{md5(url)}.json"; IO errors are logged and rethrown.
/// </summary>
public void Process(ResultItems resultItems, ISpider spider)
{
    var digest = Encrypt.Md5Encrypt(resultItems.Request.Url.ToString());
    string path = $"{BasePath}{PathSeperator}{spider.Identity}{PathSeperator}{digest}.json";
    try
    {
        FileInfo file = PrepareFile(path);
        using (var writer = new StreamWriter(file.OpenWrite(), Encoding.UTF8))
        {
            writer.WriteLine(JsonConvert.SerializeObject(resultItems.Results));
        }
    }
    catch (IOException e)
    {
        LogUtils.GetLogger(spider).Warn("write file error", e);
        throw;
    }
}
/// <summary>
/// Triggers an ADSL redial when the page's exception message contains the
/// configured marker, then schedules the request for a retry cycle and
/// replaces the exception with a DownloadException.
/// </summary>
/// <param name="page">Page data.</param>
/// <param name="spider">The running spider.</param>
public override void Handle(ref Page page, ISpider spider)
{
    // FIX: IsNullOrWhiteSpace subsumes the original's separate IsNullOrEmpty
    // check — the duplicate condition was dead code.
    if (page == null || string.IsNullOrWhiteSpace(page.Content) || page.Exception == null)
    {
        return;
    }
    if (page.Exception.Message.Contains(_exceptionMessage))
    {
        if (NetworkCenter.Current.Executor.Redial() == RedialResult.Failed)
        {
            Logger.AllLog(spider.Identity, "Exit program because redial failed.", LogLevel.Error);
            spider.Exit();
        }
        Spider.AddToCycleRetry(page.Request, spider.Site);
        page.Exception = new DownloadException("Download failed and redial finished already.");
    }
}
/// <summary>
/// Prints every key/value of the item to the console framed by separator
/// lines, then defers to the base pipeline.
/// </summary>
public override Task Process(Item item, ISpider spider)
{
    // FIX: the original switch on spider.Name listed "Simple"/"Movie"/"TMaill"
    // cases that all fell through to default, so every name executed the same
    // body — the switch was dead code and has been removed.
    Console.WriteLine("-----------------------");
    foreach (var n in item.Data)
    {
        Console.WriteLine(string.Format("{0}:{1}", n.Key, n.Value));
    }
    Console.WriteLine("-----------------------");
    return (base.Process(item, spider));
}
/// <summary>
/// Scrapes estate name / link / price triples out of the raw listing HTML
/// with a regular expression and lazily yields one Estate per match.
/// </summary>
public IEnumerable<Estate> Acquire(ISpider spider)
{
    // Sample of the markup being matched:
    //<strong class="font14"><span><a href="/c-xisilaigongguan8830/" target="_blank">西斯莱公馆</a>
    //<p><span>45973</span>元/平米</p>
    // NOTE(review): RegexOptions.Compiled on a regex rebuilt every call pays
    // the compilation cost each time — consider a static readonly field.
    var regex = new Regex("<strong\\s*class=\"font14\".*?<a\\s*href=\"(?<link>.*?)\".*?>(?<name>.*?)<\\/a>(.|\n)*?<p><span>(?<price>\\d+)</span>元/平米", RegexOptions.Compiled);
    var matches = regex.Matches(content);
    foreach (var match in matches)
    {
        var matchInType = (Match)match;
        var link = SiteRoots.HomeLink + matchInType.Groups["link"].Value;
        var name = matchInType.Groups["name"].Value;
        var priceText = matchInType.Groups["price"].Value;
        yield return (new Estate
        {
            Name = name,
            Price = TryParseDouble(priceText),
            // NOTE(review): GetArea issues a per-estate lookup — could be slow
            Area = GetArea(spider, link)
        });
    }
}
/// <summary>
/// Tries to serve the page from the local file cache
/// ("{BasePath}/{identity}/{md5(url)}"); on a cache miss or read failure it
/// falls back to the wrapped downloader.
/// </summary>
public Page Download(Request request, ISpider spider)
{
    string path = BasePath + "/" + spider.Identity + "/";
    Page page = null;
    try
    {
        FileInfo file = PrepareFile(path + Encrypt.Md5Encrypt(request.Url.ToString()));
        // FIX: the original leaked the StreamReader; dispose it deterministically
        using (StreamReader bufferedReader = new StreamReader(file.OpenRead()))
        {
            string line = bufferedReader.ReadLine();
            if (("url:\t" + request.Url).Equals(line))
            {
                string html = GetHtml(bufferedReader);
                page = new Page(request, spider.Site.ContentType);
                page.Url = request.Url.ToString();
                page.Content = html;
            }
        }
    }
    catch (IOException e)
    {
        // FIX: the original called e.GetType().IsInstanceOfType(typeof(...)),
        // which tests a Type object and is always false; a simple type test
        // works on every runtime.
        if (e is FileNotFoundException)
        {
            spider.Logger.Info("File not exist for url: " + request.Url);
        }
        else
        {
            spider.Logger.Warn("File read error for url " + request.Url, e);
        }
    }
    if (page == null)
    {
        // FIX: the original unconditionally overwrote the cached page here,
        // so a cache hit was never actually returned.
        page = DownloadWhenMiss(request, spider);
    }
    return page;
}
/// <summary>
/// Writes the url plus every result entry (list values expanded one item per
/// line) to "{BasePath}/{identity}/{md5(url)}.fd"; failures are logged and
/// rethrown.
/// </summary>
public void Process(ResultItems resultItems, ISpider spider)
{
    // FIX: removed the unused "StringBuilder builer" local from the original.
    string filePath = $"{BasePath}{PathSeperator}{spider.Identity}{PathSeperator}{Encrypt.Md5Encrypt(resultItems.Request.Url.ToString())}.fd";
    try
    {
        FileInfo file = PrepareFile(filePath);
        using (StreamWriter printWriter = new StreamWriter(file.OpenWrite(), Encoding.UTF8))
        {
            printWriter.WriteLine("url:\t" + resultItems.Request.Url);
            foreach (var entry in resultItems.Results)
            {
                if (entry.Value is IList list)
                {
                    // list values: key header, then one line per element
                    printWriter.WriteLine(entry.Key + ":");
                    foreach (var o in list)
                    {
                        printWriter.WriteLine(o);
                    }
                }
                else
                {
                    printWriter.WriteLine(entry.Key + ":\t" + entry.Value);
                }
            }
        }
    }
    catch (Exception e)
    {
        spider.Logger.Warn("Write file error.", e);
        throw;
    }
}
/// <summary>
/// Prints every result entry to the console as "key:value".
/// </summary>
public void Process(ResultItems resultItems, ISpider spider)
{
    foreach (var entry in resultItems.Results)
    {
        Console.WriteLine(entry.Key + ":" + entry.Value);
    }
}
/// <summary>
/// Downloads the request over HTTP (optionally generating a POST body first),
/// enforces the accepted status codes, builds the Page and runs ValidatePage.
/// Failures other than RedialException are attached to a fresh Page, validated
/// and rethrown; the HTTP response is always disposed in the finally block.
/// </summary>
public override Page Download(Request request, ISpider spider)
{
    if (spider.Site == null)
    {
        return null;
    }
    Site site = spider.Site;
    var acceptStatCodes = site.AcceptStatCode;
    //Logger.InfoFormat("Downloading page {0}", request.Url);
    HttpResponseMessage response = null;
    // a proxy is drawn from the pool and recorded on the request extras
    var proxy = site.GetHttpProxyFromPool();
    request.PutExtra(Request.Proxy, proxy);
    int statusCode = 200;
    try
    {
        if (GeneratePostBody != null)
        {
            // POST body generation is serialized through a single executor
            SingleExecutor.Execute(() => { GeneratePostBody(spider.Site, request); });
        }
        var httpMessage = GenerateHttpRequestMessage(request, site);
        // the blocking send is wrapped so a redial can coordinate with it
        response = RedialManagerUtils.Execute("downloader-download", (m) =>
        {
            var message = (HttpRequestMessage)m;
            return httpClient.SendAsync(message).Result;
        }, httpMessage);
        AddRequestCount();
        response.EnsureSuccessStatusCode();
        if (!site.AcceptStatCode.Contains(response.StatusCode))
        {
            throw new DownloadException($"下载 {request.Url} 失败. Code: {response.StatusCode}");
        }
        statusCode = (int)response.StatusCode;
        request.PutExtra(Request.StatusCode, statusCode);
        Page page = HandleResponse(request, response, statusCode, site);
        // need update
        page.TargetUrl = request.Url.ToString();
        //page.SetRawText(File.ReadAllText(@"C:\Users\Lewis\Desktop\taobao.html"));
        // Whenever a login wall is hit, throw after the redial succeeds so the
        // Spider re-queues the request through the Scheduler. With many
        // threads, multiple "Warning Custom Validate Failed" messages are
        // expected — consider a dedicated exception type to tell them apart.
        ValidatePage(page, spider);
        // Must be cleared afterwards: persisting this value into Redis would
        // make a single task loop forever.
        //request.PutExtra(Request.CycleTriedTimes, null);
        //#if !NET_CORE
        // httpWebRequest.ServicePoint.ConnectionLimit = int.MaxValue;
        //#endif
        return page;
        // The normal result has already returned above; reaching this point
        // would necessarily mean the download failed.
        //throw new SpiderExceptoin("Download failed.");
    }
    catch (RedialException)
    {
        // redial-driven interruptions propagate untouched
        throw;
    }
    catch (Exception e)
    {
        Page page = new Page(request, site.ContentType) { Exception = e };
        ValidatePage(page, spider);
        throw;
    }
    finally
    {
        // Close the response here so an exception above cannot leak it.
        try
        {
            //ensure the connection is released back to pool
            //check:
            //EntityUtils.consume(httpResponse.getEntity());
            response?.Dispose();
        }
        catch (Exception e)
        {
            var logger = LogUtils.GetLogger(spider);
            logger.Warn("Close response fail.", e);
        }
    }
}
/// <summary>
/// Persists the batch to a relational database inside a redial-safe section:
/// Insert mode binds the declared Columns and runs the insert SQL per entity;
/// Update mode binds UpdateColumns followed by the Primary key columns and
/// runs the update SQL per entity.
/// </summary>
public void Process(List<JObject> datas, ISpider spider)
{
    RedialManagerUtils.Execute("pipeline-", () =>
    {
        switch (Mode)
        {
            case PipelineMode.Insert:
                {
                    using (var conn = CreateConnection())
                    {
                        var cmd = conn.CreateCommand();
                        cmd.CommandText = GetInsertSql();
                        cmd.CommandType = CommandType.Text;
                        conn.Open();
                        foreach (var data in datas)
                        {
                            // rebind the parameter set for every entity
                            cmd.Parameters.Clear();
                            List<DbParameter> parameters = new List<DbParameter>();
                            foreach (var column in Columns)
                            {
                                var parameter = CreateDbParameter();
                                parameter.ParameterName = $"@{column.Name}";
                                // values are pulled from the JSON payload by column name
                                parameter.Value = data.SelectToken($"{column.Name}")?.Value<string>();
                                parameter.DbType = Convert(column.DataType);
                                parameters.Add(parameter);
                            }
                            cmd.Parameters.AddRange(parameters.ToArray());
                            cmd.ExecuteNonQuery();
                        }
                        conn.Close();
                    }
                    break;
                }
            case PipelineMode.Update:
                {
                    using (var conn = CreateConnection())
                    {
                        var cmd = conn.CreateCommand();
                        cmd.CommandText = GetUpdateSql();
                        cmd.CommandType = CommandType.Text;
                        conn.Open();
                        foreach (var data in datas)
                        {
                            cmd.Parameters.Clear();
                            List<DbParameter> parameters = new List<DbParameter>();
                            // SET-clause values first...
                            foreach (var column in UpdateColumns)
                            {
                                var parameter = CreateDbParameter();
                                parameter.ParameterName = $"@{column.Name}";
                                parameter.Value = data.SelectToken($"{column.Name}")?.Value<string>();
                                parameter.DbType = Convert(column.DataType);
                                parameters.Add(parameter);
                            }
                            // ...then the WHERE-clause primary-key values
                            foreach (var column in Primary)
                            {
                                var parameter = CreateDbParameter();
                                parameter.ParameterName = $"@{column.Name}";
                                parameter.Value = data.SelectToken($"{column.Name}")?.Value<string>();
                                parameter.DbType = Convert(column.DataType);
                                parameters.Add(parameter);
                            }
                            cmd.Parameters.AddRange(parameters.ToArray());
                            cmd.ExecuteNonQuery();
                        }
                        conn.Close();
                    }
                    break;
                }
        }
    });
}
/// <summary>
/// Removes the spider's queue, de-duplication set and item keys from Redis.
/// </summary>
public void Clear(ISpider spider)
{
    var identity = spider.Identity;
    Redis.KeyDelete(GetQueueKey(identity));
    Redis.KeyDelete(GetSetKey(identity));
    Redis.KeyDelete(GetItemKey(identity));
}
/// <summary>
/// Stub downloader: always returns an empty HTML page for the request.
/// </summary>
public Page Download(Request request, ISpider spider)
{
    return new Page(request, ContentType.Html) { Content = "" };
}
/// <summary>
/// Total requests ever scheduled = cardinality of the spider's
/// de-duplication set in Redis.
/// </summary>
public int GetTotalRequestsCount(ISpider spider)
{
    return RedialManagerUtils.Execute("rds-gettotalcount", () =>
        (int)Redis.SetLength(GetSetKey(spider.Identity)));
}
/// <summary>
/// Requests still pending = length of the spider's queue list in Redis.
/// </summary>
public int GetLeftRequestsCount(ISpider spider)
{
    return RedialManagerUtils.Execute("rds-getleftcount", () =>
        (int)Redis.ListLength(GetQueueKey(spider.Identity)));
}
/// <summary>
/// Derives the Redis key names from the md5 of the spider's identity and
/// registers the task in the global sorted task list (redial-safe).
/// </summary>
public override void Init(ISpider spider)
{
    base.Init(spider);
    var md5 = Encrypt.Md5Encrypt(spider.Identity);
    ItemKey += md5;
    SetKey += md5;
    // NOTE(review): QueueKey is overwritten (=) while the sibling keys are
    // appended to (+=) — verify this asymmetry is intentional.
    QueueKey = md5;
    ErrorCountKey += md5;
    SuccessCountKey += md5;
    IdentityMd5 = md5;
    RedialManagerUtils.Execute("rds-in", () =>
    {
        Db.SortedSetAdd(TaskList, spider.Identity, (long)DateTimeUtils.GetCurrentTimeStamp());
    });
}
/// <summary>
/// Creates the grabber bound to the spider it reads pages from.
/// </summary>
public HudsonApiGraber(ISpider spider) => this.spider = spider;
/// <summary>
/// Writes "url" and "html" result lines to
/// "{BasePath}/{identity}/{md5(url)}.html"; IO failures are logged, not rethrown.
/// </summary>
public void Process(ResultItems resultItems, ISpider spider)
{
    var folder = BasePath + PathSeperator + spider.Identity + PathSeperator;
    var fileName = Encrypt.Md5Encrypt(resultItems.Request.Url.ToString()) + ".html";
    try
    {
        FileInfo fileInfo = PrepareFile(folder + fileName);
        using (var writer = new StreamWriter(fileInfo.OpenWrite(), Encoding.UTF8))
        {
            writer.WriteLine("url:\t" + resultItems.Request.Url);
            writer.WriteLine("html:\t" + resultItems.GetResultItem("html"));
        }
    }
    catch (IOException e)
    {
        // best-effort persistence: log and continue
        spider.Logger.Warn("Write file error.", e);
    }
}
/// <summary>
/// Binds this component to the spider it serves.
/// </summary>
public void Init(ISpider spider) => Spider = spider;
/// <summary>
/// Delegates to the fallback downloader when one is configured; otherwise
/// returns null.
/// </summary>
private Page DownloadWhenMiss(Request request, ISpider spider)
{
    return _downloaderWhenFileMiss?.Download(request, spider);
}
/// <summary>
/// Binds this component to the spider it serves; overridable for extra setup.
/// </summary>
public virtual void Init(ISpider spider) => Spider = spider;
/// <summary>
/// Returns the runtime type name of the spider instance.
/// </summary>
public string GetSpiderName(ISpider spider)
{
    var spiderType = spider.GetType();
    return spiderType.Name;
}