/// <summary>
/// Creates the pipeline and stores the collaborators handed in by the caller.
/// </summary>
/// <param name="entityName">Entity name kept in <c>_entityName</c>.</param>
/// <param name="nextSpiderScheduler">Scheduler exposed through <c>NextSpiderScheduler</c>.</param>
/// <param name="nextSpider">Spider exposed through <c>NextSpider</c>.</param>
/// <param name="prepareStartUrls">Start-URL builder kept in <c>_prepareStartUrls</c>.</param>
public LinkSpiderPipeline(string entityName, IScheduler nextSpiderScheduler, ISpider nextSpider, LinkSpiderPrepareStartUrls prepareStartUrls)
{
    // Private state first, then the publicly visible members.
    _entityName = entityName;
    _prepareStartUrls = prepareStartUrls;

    NextSpider = nextSpider;
    NextSpiderScheduler = nextSpiderScheduler;
}
 /// <summary>
 /// Writes the text form of every item in <paramref name="datas"/> to the console, one item per call.
 /// </summary>
 /// <param name="datas">Items to print.</param>
 /// <param name="spider">Not used by this method.</param>
 public void Process(List<JObject> datas, ISpider spider)
 {
     datas.ForEach(item => Console.WriteLine(item.ToString()));
 }
 /// <summary>
 /// Appends all items of <paramref name="datas"/> to <c>_collector</c> while holding the monitor of this instance.
 /// </summary>
 /// <param name="datas">Items to collect.</param>
 /// <param name="spider">Not used by this method.</param>
 public void Process(List<JObject> datas, ISpider spider)
 {
     // Explicit form of a lock on this instance.
     // NOTE(review): locking on the instance itself lets outside code contend for the same monitor.
     bool lockTaken = false;
     try
     {
         System.Threading.Monitor.Enter(this, ref lockTaken);
         _collector.AddRange(datas);
     }
     finally
     {
         if (lockTaken)
         {
             System.Threading.Monitor.Exit(this);
         }
     }
 }
 /// <summary>
 /// Prints every entry of <c>resultItems.Results</c> to the console as "key:&lt;TAB&gt;value".
 /// </summary>
 /// <param name="resultItems">Results to print.</param>
 /// <param name="spider">Not used by this method.</param>
 public void Process(ResultItems resultItems, ISpider spider)
 {
     foreach (var pair in resultItems.Results)
     {
         var line = pair.Key + ":\t" + pair.Value;
         System.Console.WriteLine(line);
     }
 }
        /// <summary>
        /// Collects the entries stored under <c>_entityName</c> from every result item and forwards
        /// them, as one flat list, to the <c>Process(List&lt;JObject&gt;, ISpider)</c> overload.
        /// </summary>
        /// <param name="resultItemsList">Result items to scan; a null or empty list is ignored.</param>
        /// <param name="spider">Spider passed through to the list overload.</param>
        protected override void Process(List<ResultItems> resultItemsList, ISpider spider)
        {
            bool nothingToDo = resultItemsList == null || resultItemsList.Count == 0;
            if (nothingToDo)
            {
                return;
            }

            var collected = new List<JObject>();
            foreach (var item in resultItemsList)
            {
                dynamic entry = item.GetResultItem(_entityName);

                if (entry is JObject)
                {
                    // A single object is stored directly.
                    collected.Add(entry);
                }
                else if (entry != null)
                {
                    // Anything else is treated as a sequence of objects (bound at run time).
                    collected.AddRange(entry);
                }
            }

            Process(collected, spider);
        }
 /// <summary>
 /// Prepares the fixture fields: a <c>ResultItems</c> holding one "content" entry and a request
 /// for http://www.baidu.com, plus a fresh <c>DefaultSpider</c>.
 /// </summary>
 public void Before()
 {
     var request = new Request("http://www.baidu.com", 1, null);

     _resultItems = new ResultItems();
     _resultItems.AddOrUpdateResultItem("content", "爬虫工具");
     _resultItems.Request = request;

     _spider = new DefaultSpider();
 }
Beispiel #7
0
 /// <summary>
 /// Creates the instance and stores each injected collaborator in the field of the same name.
 /// </summary>
 /// <param name="settings">Settings dependency.</param>
 /// <param name="logger">Logger dependency.</param>
 /// <param name="scanner">Scanner dependency.</param>
 /// <param name="reporting">Reporting dependency.</param>
 /// <param name="spider">Spider dependency.</param>
 public Zap(ISettings settings, ILogger logger, IScanner scanner, IReporting reporting, ISpider spider)
 {
     // Infrastructure first, then the scanning components.
     this.logger = logger;
     this.settings = settings;

     this.spider = spider;
     this.scanner = scanner;
     this.reporting = reporting;
 }
        /// <summary>
        /// Lets <c>_prepareStartUrls</c> build on the spider's site from <paramref name="datas"/>, then
        /// pushes every start request of that site to <c>NextSpiderScheduler</c>.
        /// </summary>
        /// <param name="datas">Data handed to the start-URL builder.</param>
        /// <param name="spider">Spider whose site is used.</param>
        private void Process(List<JObject> datas, ISpider spider)
        {
            var site = spider.Site;
            _prepareStartUrls.Build(site, datas);

            foreach (var request in site.StartRequests)
            {
                NextSpiderScheduler.Push(request);
            }
        }
        /// <summary>
        /// Builds a page from a local file: the content is read from the local path of the request
        /// URL and the status code is fixed to 200.
        /// </summary>
        /// <param name="request">Request whose URL points at the file to read.</param>
        /// <param name="spider">Spider whose site supplies the content type.</param>
        /// <returns>The populated page.</returns>
        public Page Download(Request request, ISpider spider)
        {
            var page = new Page(request, spider.Site.ContentType)
            {
                Content = File.ReadAllText(request.Url.LocalPath)
            };

            // Both URL properties carry the same request address.
            var address = request.Url.ToString();
            page.TargetUrl = address;
            page.Url = address;
            page.StatusCode = 200;

            return page;
        }
 /// <summary>
 /// Prints every entry of <c>resultItems.Results</c> as "key:&lt;TAB&gt;value"; the sink is
 /// <c>Log.WriteLine</c> when NET_CORE is defined and the console otherwise.
 /// </summary>
 /// <param name="resultItems">Results to print.</param>
 /// <param name="spider">Not used by this method.</param>
 public void Process(ResultItems resultItems, ISpider spider)
 {
     foreach (var pair in resultItems.Results)
     {
         var line = pair.Key + ":\t" + pair.Value;
     #if NET_CORE
         Log.WriteLine(line);
     #else
         System.Console.WriteLine(line);
     #endif
     }
 }
        /// <summary>
        /// Prints the text form of every item; the sink is <c>Log.WriteLine</c> when NET_CORE is
        /// defined and the console otherwise.
        /// </summary>
        /// <param name="datas">Items to print.</param>
        /// <param name="spider">Not used by this method.</param>
        public void Process(List<JObject> datas, ISpider spider)
        {
            foreach (var item in datas)
            {
                var text = item.ToString();
            #if NET_CORE
                Log.WriteLine(text);
            #else
                Console.WriteLine(text);
            #endif
            }
        }
Beispiel #12
0
        /// <summary>
        /// Returns the path "&lt;base directory&gt;/data/&lt;spider identity&gt;/&lt;name&gt;.sql", creating
        /// the containing folder when it does not exist yet.
        /// </summary>
        /// <param name="spider">Spider whose identity names the folder.</param>
        /// <param name="name">File name without extension.</param>
        /// <returns>The full path of the .sql file.</returns>
        public static string GetDataFilePath(ISpider spider, string name)
        {
            // Only the source of the base directory differs between the two targets.
            #if !NET_CORE
            string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
            #else
            string baseDirectory = AppContext.BaseDirectory;
            #endif
            string folderPath = Path.Combine(baseDirectory, "data", spider.Identity);

            bool folderMissing = !Directory.Exists(folderPath);
            if (folderMissing)
            {
                Directory.CreateDirectory(folderPath);
            }

            string fileName = name + ".sql";
            return Path.Combine(folderPath, fileName);
        }
 /// <summary>
 /// Wraps every item in a document carrying the task id, one shared timestamp and the item's
 /// text form, then inserts all documents into <c>_collection</c> in a single call.
 /// </summary>
 /// <param name="datas">Items to store; a null or empty list results in no insert.</param>
 /// <param name="spider">Not used by this method.</param>
 public void Process(List<JObject> datas, ISpider spider)
 {
     // Assuming _collection is a MongoDB driver collection: InsertMany throws on an empty batch,
     // so skip the call when there is nothing to store.
     if (datas == null || datas.Count == 0)
     {
         return;
     }

     // One timestamp for the whole batch.
     var time = DateTime.Now;
     var documents = new List<BsonDocument>(datas.Count);
     foreach (var data in datas)
     {
         documents.Add(new BsonDocument
         {
             {"TaskId", _id},
             {"Timestamp", time},
             {"Data", data.ToString()}
         });
     }

     _collection.InsertMany(documents);
 }
Beispiel #14
0
        /// <summary>
        /// Prints, for every result item, the size of its "AtzucheList" entry followed by the
        /// car number and type of each model in that list.
        /// </summary>
        /// <param name="resultItems">Result items to print.</param>
        /// <param name="spider">Not used by this method.</param>
        public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
        {
            foreach (var resultItem in resultItems)
            {
                // Look the entry up and cast it once; an entry that is not a model list is skipped
                // instead of being dereferenced as null.
                var models = resultItem.Results["AtzucheList"] as List<AtzucheModel>;
                if (models == null)
                {
                    continue;
                }

                Console.WriteLine(models.Count);
                foreach (var item in models)
                {
                    Console.WriteLine($"{item.carNo}:{item.type} ");
                }
            }
        }
 /// <summary>
 /// Serializes <c>resultItems.Results</c> to JSON and writes it to
 /// "&lt;BasePath&gt;/&lt;spider identity&gt;/&lt;md5 of request url&gt;.json".
 /// </summary>
 /// <param name="resultItems">Results to store; its request URL determines the file name.</param>
 /// <param name="spider">Spider supplying the identity and the logger.</param>
 /// <exception cref="IOException">Logged as a warning and rethrown when the file cannot be written.</exception>
 public void Process(ResultItems resultItems, ISpider spider)
 {
     string path = $"{BasePath}{PathSeperator}{ spider.Identity}{PathSeperator}{Encrypt.Md5Encrypt(resultItems.Request.Url.ToString())}.json";
     try
     {
         FileInfo file = PrepareFile(path);
         // The file name is derived from the URL, so the same file can be written again.
         // FileMode.Create truncates it; OpenWrite would leave bytes of a longer previous
         // payload behind and corrupt the JSON.
         using (FileStream stream = file.Open(FileMode.Create, FileAccess.Write))
         using (StreamWriter printWriter = new StreamWriter(stream, Encoding.UTF8))
         {
             printWriter.WriteLine(JsonConvert.SerializeObject(resultItems.Results));
         }
     }
     catch (IOException e)
     {
         spider.Logger.Warn("write file error", e);
         throw;
     }
 }
Beispiel #16
0
        /// <summary>
        /// Writes <paramref name="datas"/> into the worksheet named after the model's table, inside the
        /// package kept for "&lt;base&gt;/excels/&lt;spider name&gt;_&lt;spider identity&gt;.xlsx". A new sheet gets a
        /// header row with the lower-cased field names.
        /// </summary>
        /// <param name="model">Model supplying the sheet name and the columns.</param>
        /// <param name="datas">Records to write; each is indexed by field name.</param>
        /// <param name="spider">Spider whose name and identity form the workbook path.</param>
        private void WriteToExcel(IModel model, IEnumerable<dynamic> datas, ISpider spider)
        {
            var excelPath  = Path.Combine(Env.BaseDirectory, "excels", $"{spider.Name}_{spider.Identity}.xlsx");
            var sheetName  = model.TableInfo.Name;
            var sheetIndex = $"{excelPath}.{sheetName}";

            if (!_packages.ContainsKey(excelPath))
            {
                _packages.Add(excelPath, new ExcelPackage());
            }

            if (!_rowRecords.ContainsKey(sheetIndex))
            {
                _rowRecords.Add(sheetIndex, 1);
            }

            var p = _packages[excelPath];

            var sheet = p.Workbook.Worksheets[sheetName];

            // Continue at the row recorded for this sheet. Starting at 1 on every call would make the
            // first record of a later batch overwrite the header row of an existing sheet.
            // NOTE(review): assumes IncreaseRowIndex stores the row it returns in _rowRecords — confirm.
            int row     = _rowRecords[sheetIndex];
            var columns = model.Fields.ToList();

            if (sheet == null)
            {
                sheet = p.Workbook.Worksheets.Add(sheetName);

                // Header row: column positions are 1-based.
                for (int i = 1; i < columns.Count + 1; ++i)
                {
                    sheet.Cells[1, i].Value = columns[i - 1].Name.ToLower();
                }

                row = IncreaseRowIndex(sheetIndex);
            }

            foreach (var data in datas)
            {
                for (int j = 1; j < columns.Count + 1; ++j)
                {
                    var column = columns[j - 1].Name;
                    sheet.Cells[row, j].Value = data[column];
                }

                row = IncreaseRowIndex(sheetIndex);
            }
        }
Beispiel #17
0
        /// <summary>
        /// Concurrent method: creates one spider per resolvable type name, runs each spider on its
        /// own thread, waits for those threads and returns the data extracted by all of them.
        /// </summary>
        /// <typeparam name="T">Type of the extracted data items.</typeparam>
        /// <param name="classTypes">Type names of the spiders to run; names that cannot be resolved are skipped.</param>
        /// <returns>
        /// The combined extracted data; empty when this instance is disposed or cancellation was requested.
        /// </returns>
        public IList<T> StartConcurrent<T>(List<string> classTypes)
        {
            List<T> data = new List<T>();

            if (disposedValue || CancellationToken.IsCancellationRequested)
            {
                return data;
            }

            // Track what THIS call started, so only these threads are joined and no cast is needed later.
            List<ISpider<T>> startedSpiders = new List<ISpider<T>>();
            List<Thread> startedThreads = new List<Thread>();

            foreach (string classType in classTypes)
            {
                Type? type = Type.GetType(classType);
                if (type == null)
                {
                    continue;
                }
                ISpider<T> spider = _spiderFactory.GetSpider<T>(type, CancellationToken);

                // Register the new spider and its thread with the shared collections as well.
                _spiders.Add(spider);
                startedSpiders.Add(spider);
                Thread thread = new Thread(new ThreadStart(spider.Go));
                _threads.Add(thread);
                startedThreads.Add(thread);
                thread.Start();
            }

            // Join the local list: the shared _threads collection may hold threads of other calls
            // and may change while it is being enumerated.
            foreach (Thread thread in startedThreads)
            {
                thread.Join();
            }

            foreach (ISpider<T> spider in startedSpiders)
            {
                _spiders.Remove(spider);
            }

            foreach (ISpider<T> spider in startedSpiders)
            {
                data.AddRange(spider.ExtractData);
                spider.Dispose();
            }

            _logger.LogDebug("Spider finished.");
            return data;
        }
Beispiel #18
0
 /// <summary>
 /// Takes the next request while holding the monitor of this instance, trying
 /// <c>_priorityQueuePlus</c> first, then <c>_noPriorityQueue</c>, then <c>_priorityQueueMinus</c>.
 /// </summary>
 /// <param name="spider">Not used by this method.</param>
 /// <returns>The first non-null request found, or whatever the last queue yields.</returns>
 public override Request Poll(ISpider spider)
 {
     lock (this)
     {
         Request next = _priorityQueuePlus.Pop();

         if (next == null)
         {
             next = _noPriorityQueue.Dequeue();
         }

         if (next == null)
         {
             next = _priorityQueueMinus.Pop();
         }

         return next;
     }
 }
Beispiel #19
0
        /// <summary>
        /// Stores each result item in its own newly named ".dsd" file inside the spider's data
        /// folder and records on the request how many values were written.
        /// </summary>
        /// <param name="resultItems">Result items to store.</param>
        /// <param name="spider">Spider supplying the data folder and the logger.</param>
        public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
        {
            try
            {
                foreach (var item in resultItems)
                {
                    var request = item.Request;
                    request.CountOfResults = 0;
                    request.EffectedRows   = 0;

                    string fileName = $"{ Guid.NewGuid():N}.dsd";
                    string filePath = Path.Combine(GetDataFolder(spider), fileName);

                    using (StreamWriter output = new StreamWriter(File.OpenWrite(filePath), Encoding.UTF8))
                    {
                        output.WriteLine("url:\t" + request.Url);

                        foreach (var entry in item.Results)
                        {
                            var values = entry.Value as IList;
                            int written;

                            if (values != null)
                            {
                                // List values: a header line, then one line per element.
                                output.WriteLine(entry.Key + ":");
                                foreach (var element in values)
                                {
                                    output.WriteLine(element);
                                }

                                written = values.Count;
                            }
                            else
                            {
                                output.WriteLine(entry.Key + ":\t" + entry.Value);
                                written = 1;
                            }

                            // Both counters always advance together.
                            request.CountOfResults += written;
                            request.EffectedRows   += written;
                        }
                    }
                }
            }
            catch
            {
                spider.Logger.Error("Write file error.");
                throw;
            }
        }
 /// <summary>
 /// Assigns the cookies returned by <c>GetCookies</c> to the spider's site and logs the result.
 /// </summary>
 /// <param name="spider">Spider that receives the cookies.</param>
 /// <param name="stopSpider">
 /// When true, the work is handed to <c>spider.Pause</c> as a callback that ends by calling
 /// <c>spider.Contiune()</c>; when false, it runs immediately.
 /// </param>
 public virtual void Inject(ISpider spider, bool stopSpider = true)
 {
     System.Action inject = () =>
     {
         spider.Site.Cookies = GetCookies(spider);
         Logger.MyLog(spider.Identity, "注入 Cookies 成功.", LogLevel.Info);
     };

     if (!stopSpider)
     {
         inject();
         return;
     }

     spider.Pause(() =>
     {
         inject();
         spider.Contiune();
     });
 }
        /// <summary>
        /// Reads the file addressed by the request URI and returns it as a page.
        /// </summary>
        /// <param name="request">Request whose URI addresses the file.</param>
        /// <param name="spider">Not used by this method.</param>
        /// <returns>A page holding the file content, or null when the file does not exist.</returns>
        protected override Page DowloadContent(Request request, ISpider spider)
        {
            var uri = request.Uri;

            // File APIs need a file-system path. For a file URI, AbsoluteUri is the "file:///..."
            // form, which File.Exists never finds; LocalPath is the matching path.
            var filePath = uri.IsFile ? uri.LocalPath : uri.AbsoluteUri;

            if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath))
            {
                return null;
            }

            return new Page(request)
            {
                Content = File.ReadAllText(filePath)
            };
        }
 /// <summary>
 /// When the page content contains <c>Content</c>, triggers a redial, replaces the page with the
 /// result of <c>Spider.AddToCycleRetry</c> and attaches a <c>DownloadException</c> to it.
 /// </summary>
 /// <param name="page">Page to inspect; replaced when the content matches.</param>
 /// <param name="spider">Spider used for logging, exiting and its site.</param>
 public override void Handle(ref Page page, ISpider spider)
 {
     if (page == null || string.IsNullOrEmpty(Content) || string.IsNullOrEmpty(page.Content))
     {
         return;
     }

     if (!page.Content.Contains(Content))
     {
         return;
     }

     var redial = NetworkCenter.Current.Executor.Redial();
     if (redial == RedialResult.Failed)
     {
         Logger.MyLog(spider.Identity, "Exit program because redial failed.", LogLevel.Error);
         spider.Exit();
     }

     // Execution continues here even after spider.Exit() has been called.
     page           = Spider.AddToCycleRetry(page.Request, spider.Site);
     page.Exception = new DownloadException($"Content downloaded contains string: {Content}.");
 }
        /// <summary>
        /// Reads the file named by the request's "__FilePath" extra and returns it as a page.
        /// </summary>
        /// <param name="request">Request carrying the "__FilePath" extra.</param>
        /// <param name="spider">Not used by this method.</param>
        /// <returns>
        /// A completed task holding the page, or a completed task holding null when no path is set
        /// or the file does not exist. The task itself is never null, so it can always be awaited.
        /// </returns>
        protected override Task<Page> DowloadContent(Request request, ISpider spider)
        {
            var filePath = request.GetExtra("__FilePath");

            if (!string.IsNullOrEmpty(filePath))
            {
                if (File.Exists(filePath))
                {
                    return Task.FromResult(new Page(request)
                    {
                        Content = File.ReadAllText(filePath)
                    });
                }
            }

            // Returning a null Task would make any await on the result throw.
            return Task.FromResult<Page>(null);
        }
Beispiel #24
0
 /// <summary>
 /// Returns the data folder of a spider, "&lt;RootDataFolder&gt;/&lt;spider identity&gt;", creating it on
 /// first use and remembering it in <c>_dataFolderCache</c>.
 /// </summary>
 /// <param name="spider">Spider whose identity names the folder.</param>
 /// <returns>The folder path.</returns>
 internal string GetDataFolder(ISpider spider)
 {
     if (!_dataFolderCache.ContainsKey(spider.Identity))
     {
         var folder = Path.Combine(RootDataFolder, spider.Identity);

         bool folderMissing = !Directory.Exists(folder);
         if (folderMissing)
         {
             Directory.CreateDirectory(folder);
         }

         _dataFolderCache.TryAdd(spider.Identity, folder);
         return folder;
     }

     return _dataFolderCache[spider.Identity];
 }
Beispiel #25
0
        /// <summary>
        /// Replaces the page content with the concatenation of every match of <c>_pattern</c>
        /// (evaluated with <c>_regexOptions</c>) found in the current content.
        /// </summary>
        /// <param name="page">Page data; left untouched when null or without non-blank content.</param>
        /// <param name="downloader">Downloader; not used by this method.</param>
        /// <param name="spider">Spider; not used by this method.</param>
        public override void Handle(ref Page page, IDownloader downloader, ISpider spider)
        {
            if (page == null || string.IsNullOrWhiteSpace(page.Content))
            {
                return;
            }

            var joined = new System.Text.StringBuilder();

            foreach (Match match in Regex.Matches(page.Content, _pattern, _regexOptions))
            {
                joined.Append(match.Value);
            }

            page.Content = joined.ToString();
        }
Beispiel #26
0
        /// <summary>
        /// Loads cookies from the file "&lt;spider identity&gt;.cookies" (a relative path).
        /// </summary>
        /// <param name="spider">Spider whose identity names the cookie file.</param>
        /// <returns>
        /// Cookies whose <c>StringPart</c> is the file's text, or an empty <c>Cookies</c> when the
        /// file does not exist.
        /// </returns>
        protected override Cookies GetCookies(ISpider spider)
        {
            var cookieFile = $"{spider.Identity}.cookies";

            if (!File.Exists(cookieFile))
            {
                return new Cookies();
            }

            var text = File.ReadAllText(cookieFile);
            return new Cookies
            {
                StringPart = text
            };
        }
 /// <summary>
 /// Serializes the results of every item to JSON and writes one line per item to
 /// "&lt;data folder&gt;/&lt;spider identity&gt;.json".
 /// </summary>
 /// <param name="resultItems">Result items to store.</param>
 /// <param name="spider">Spider supplying the data folder and the identity.</param>
 public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
 {
     try
     {
         var fileName = $"{spider.Identity}.json";
         var target   = Path.Combine(GetDataFolder(spider), fileName);
         var writer   = GetDataWriter(target);

         foreach (var item in resultItems)
         {
             var json = JsonConvert.SerializeObject(item.Results);
             writer.WriteLine(json);
         }
     }
     catch (IOException e)
     {
         Logger.Log(spider.Identity, "Write data to json file failed.", Level.Error, e);
         throw;
     }
 }
        /// <summary>
        /// Runs the download sequence: the before-download hook, <c>DowloadContent</c>, the
        /// after-download hook and content-type detection.
        /// </summary>
        /// <param name="request">Request to download; the before-download hook receives it by reference.</param>
        /// <param name="spider">Spider performing the download.</param>
        /// <returns>The downloaded page, or null when the spider has no site.</returns>
        public Page Download(Request request, ISpider spider)
        {
            if (spider.Site != null)
            {
                HandleBeforeDownload(ref request, spider);

                var downloaded = DowloadContent(request, spider);

                // The after-download hook receives the page by reference.
                HandlerAfterDownloadComplete(ref downloaded, spider);
                TryDetectContentType(downloaded, spider);

                return downloaded;
            }

            return null;
        }
Beispiel #29
0
        /// <summary>
        /// Replaces the page content with the concatenation of every match of <c>Pattern</c>
        /// (multiline, case-insensitive) found in the current content.
        /// </summary>
        /// <param name="page">Page to filter; left untouched when null or without content.</param>
        /// <param name="spider">Not used by this method.</param>
        /// <returns>Always true.</returns>
        public override bool Handle(ref Page page, ISpider spider)
        {
            // Nothing to filter: skip pages that are missing or empty. The check must bail out on
            // the EMPTY case; the inverse would skip every real page and dereference a null one.
            if (page == null || string.IsNullOrEmpty(page.Content))
            {
                return true;
            }

            var textValue = new System.Text.StringBuilder();
            MatchCollection collection = Regex.Matches(page.Content, Pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);

            foreach (Match item in collection)
            {
                textValue.Append(item.Value);
            }

            page.Content = textValue.ToString();
            return true;
        }
        /// <summary>
        /// Replaces the page content with the concatenation of every match of <c>_pattern</c>
        /// (multiline, case-insensitive) found in the current content.
        /// </summary>
        /// <param name="page">Page to filter; left untouched when null or without content.</param>
        /// <param name="downloader">Not used by this method.</param>
        /// <param name="spider">Not used by this method.</param>
        public override void Handle(ref Page page, IDownloader downloader, ISpider spider)
        {
            if (string.IsNullOrEmpty(page?.Content))
            {
                return;
            }

            const RegexOptions options = RegexOptions.Multiline | RegexOptions.IgnoreCase;
            var joined = new System.Text.StringBuilder();

            foreach (Match match in Regex.Matches(page.Content, _pattern, options))
            {
                joined.Append(match.Value);
            }

            page.Content = joined.ToString();
        }
Beispiel #31
0
 /// <summary>
 /// Injects cookies: assigns the cookies returned by <c>GetCookies</c> to the spider's site and
 /// logs the result.
 /// </summary>
 /// <param name="spider">Spider that receives the cookies.</param>
 /// <param name="pauseBeforeInject">
 /// When true, the work is handed to <c>spider.Pause</c> as a callback that ends by calling
 /// <c>spider.Contiune()</c>; when false, it runs immediately.
 /// </param>
 public virtual void Inject(ISpider spider, bool pauseBeforeInject = true)
 {
     System.Action inject = () =>
     {
         spider.Site.Cookies = GetCookies(spider);
         Logger.AllLog(spider.Identity, "Inject cookies success.", LogLevel.Info);
     };

     if (!pauseBeforeInject)
     {
         inject();
         return;
     }

     spider.Pause(() =>
     {
         inject();
         spider.Contiune();
     });
 }
 /// <summary>
 /// Adds the requests produced by <c>GenerateRequests</c> to the page and sets
 /// <c>MissExtractTargetUrls</c>, unless a <c>Stopper</c> is configured and reports that the
 /// page needs a stop.
 /// </summary>
 /// <param name="page">Page that receives the generated requests.</param>
 /// <param name="spider">Not used by this method.</param>
 /// <returns>Always true.</returns>
 public virtual bool Handle(Page page, ISpider spider)
 {
     // No stopper means "never stop"; otherwise the stopper decides.
     bool keepGoing = Stopper == null || !Stopper.NeedStop(page, this);

     if (keepGoing)
     {
         page.AddTargetRequests(GenerateRequests(page));
         page.MissExtractTargetUrls = true;
     }

     return true;
 }
Beispiel #33
0
        /// <summary>
        /// Runs the target-URL extractor: every request it extracts from the page is added to the
        /// page as a target request.
        /// </summary>
        /// <param name="page">Page data that is scanned and receives the target requests.</param>
        /// <param name="spider">Spider whose site is passed to the extractor.</param>
        public override void Handle(ref Page page, ISpider spider)
        {
            if (_targetUrlsExtractor == null)
            {
                return;
            }
            var requests = _targetUrlsExtractor.ExtractRequests(page, spider.Site);

            foreach (var request in requests)
            {
                page.AddTargetRequest(request);
            }
            if (!_extractByProcessor)
            {
                // "!x || x" is true for every x, so state the resulting value directly.
                page.SkipExtractTargetUrls = true;
            }
        }
        /// <summary>
        /// Picks the SQL statement matching the pipeline mode of the entity and executes it for
        /// <paramref name="datas"/> through <c>ExecuteHttpSql</c>.
        /// </summary>
        /// <param name="entityName">Key into <c>EntityAdapters</c>.</param>
        /// <param name="datas">Records handed to <c>ExecuteHttpSql</c>.</param>
        /// <param name="spider">Not used by this method.</param>
        /// <returns>The value returned by <c>ExecuteHttpSql</c>, or 0 when the entity is unknown.</returns>
        public override int Process(string entityName, IEnumerable<dynamic> datas, ISpider spider)
        {
            if (!EntityAdapters.TryGetValue(entityName, out var metadata))
            {
                return 0;
            }

            string sql;
            switch (metadata.PipelineMode)
            {
                case PipelineMode.InsertAndIgnoreDuplicate:
                    sql = metadata.InsertAndIgnoreDuplicateSql;
                    break;

                case PipelineMode.InsertNewAndUpdateOld:
                    sql = metadata.InsertNewAndUpdateOldSql;
                    break;

                case PipelineMode.Update:
                    sql = metadata.UpdateSql;
                    break;

                // Plain insert is also the fallback for any other mode.
                case PipelineMode.Insert:
                default:
                    sql = metadata.InsertSql;
                    break;
            }

            return ExecuteHttpSql(sql, datas);
        }
Beispiel #35
0
        /// <summary>
        /// Maps the "url", "title" and "html" entries of every result item to a row and inserts all
        /// rows with one INSERT IGNORE statement into <c>Database</c>.<c>TableName</c>.
        /// </summary>
        /// <param name="resultItems">Result items to store.</param>
        /// <param name="spider">Not used by this method.</param>
        public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
        {
            var rows = new List<DefaulHtmlContent>();

            foreach (var item in resultItems)
            {
                // Missing entries stay null thanks to the null-conditional calls.
                var row = new DefaulHtmlContent
                {
                    Url   = item.GetResultItem("url")?.ToString(),
                    Title = item.GetResultItem("title")?.ToString(),
                    Html  = item.GetResultItem("html")?.ToString()
                };
                rows.Add(row);
            }

            using (var connection = ConnectionStringSettings.CreateDbConnection())
            {
                // Values travel as parameters; only the identifiers are interpolated.
                var sql = $"INSERT IGNORE `{Database}`.`{TableName}` (`url`, `title`, `html`) VALUES (@Url, @Title, @Html);";
                connection.MyExecute(sql, rows);
            }
        }
Beispiel #36
0
 /// <summary>
 /// Stores the parsed records in memory: they are appended to the list kept in
 /// <c>_collector</c> under the model's identity, creating that list on first use.
 /// </summary>
 /// <param name="model">Model whose identity is the collector key.</param>
 /// <param name="datas">Records to store.</param>
 /// <param name="spider">Not used by this method.</param>
 /// <returns>The number of records appended by this call.</returns>
 protected override int Process(IModel model, IEnumerable<dynamic> datas, ISpider spider)
 {
     lock (_locker)
     {
         // Single lookup instead of ContainsKey followed by the indexer.
         bool isNew = !_collector.TryGetValue(model.Identity, out var list);
         if (isNew)
         {
             list = new List<dynamic>();
         }

         // Derive the count from the list growth so that datas is enumerated only once
         // (a lazy sequence would otherwise be evaluated a second time by Count()).
         int before = list.Count;
         list.AddRange(datas);

         if (isNew)
         {
             _collector.Add(model.Identity, list);
         }

         return list.Count - before;
     }
 }
        /// <summary>
        /// Counts handled pages and triggers a redial each time the count reaches
        /// <c>RedialLimit</c>, after which the count starts again from zero.
        /// </summary>
        /// <param name="page">Not used by this method.</param>
        /// <param name="spider">Not used by this method.</param>
        /// <returns>Always true.</returns>
        public override bool Handle(Page page, ISpider spider)
        {
            // A limit of zero disables counting altogether.
            if (RedialLimit == 0)
            {
                return true;
            }

            lock (this)
            {
                ++RequestedCount;

                // A negative limit keeps counting but never redials.
                bool limitReached = RedialLimit > 0 && RequestedCount == RedialLimit;
                if (limitReached)
                {
                    RequestedCount = 0;

                    ((IRedialExecutor)NetworkCenter.Current.Executor).Redial();
                }
            }

            return true;
        }
Beispiel #38
0
        /// <summary>
        /// Initializes the base class, derives the key names from the MD5 of the spider identity
        /// and adds the identity to the <c>TaskList</c> sorted set with the current timestamp as score.
        /// </summary>
        /// <param name="spider">Spider being initialized.</param>
        public override void Init(ISpider spider)
        {
            base.Init(spider);

            var identityHash = Encrypt.Md5Encrypt(spider.Identity);

            // These keys keep their current value as a prefix and get the hash appended.
            ItemKey         = ItemKey + identityHash;
            SetKey          = SetKey + identityHash;
            ErrorCountKey   = ErrorCountKey + identityHash;
            SuccessCountKey = SuccessCountKey + identityHash;

            // NOTE(review): unlike the keys above, QueueKey is replaced by the bare hash rather than
            // appended to — confirm this is intended.
            QueueKey    = identityHash;
            IdentityMd5 = identityHash;

            NetworkProxyManager.Current.Execute("rds-in", () =>
            {
                Db.SortedSetAdd(TaskList, spider.Identity, (long)DateTimeUtils.GetCurrentTimeStamp());
            });
        }
Beispiel #39
0
 /// <summary>
 /// When the page content contains any string of <c>_contents</c>, triggers a redial, replaces
 /// the page with the result of <c>Spider.AddToCycleRetry</c> and attaches a
 /// <c>DownloadException</c> naming the first matching string.
 /// </summary>
 /// <param name="page">Page to inspect; replaced when a string matches.</param>
 /// <param name="spider">Spider used for logging, exiting and its site.</param>
 public override void Handle(ref Page page, ISpider spider)
 {
     if (string.IsNullOrEmpty(page?.Content))
     {
         return;
     }

     // Copy to a local: a ref parameter cannot be captured by the lambda below.
     var text  = page.Content;
     var found = _contents.FirstOrDefault(candidate => text.Contains(candidate));
     if (found == null)
     {
         return;
     }

     var redial = NetworkCenter.Current.Executor.Redial();
     if (redial == RedialResult.Failed)
     {
         Logger.MyLog(spider.Identity, "Exit program because redial failed.", LogLevel.Error);
         spider.Exit();
     }

     page           = Spider.AddToCycleRetry(page.Request, spider.Site);
     page.Exception = new DownloadException($"Content downloaded contains string: {found}.");
 }
Beispiel #40
0
        /// <summary>
        /// Writes the request URL and the "html" result item to
        /// "&lt;BasePath&gt;/&lt;spider identity&gt;/&lt;md5 of request url&gt;.html".
        /// </summary>
        /// <param name="resultItems">Results to store; its request URL determines the file name.</param>
        /// <param name="spider">Spider supplying the identity and the logger.</param>
        public void Process(ResultItems resultItems, ISpider spider)
        {
            string path = BasePath + PathSeperator + spider.Identity + PathSeperator;

            try
            {
                FileInfo fileInfo = PrepareFile(path + Encrypt.Md5Encrypt(resultItems.Request.Url.ToString()) + ".html");
                // The file name is derived from the URL, so the same file can be written again.
                // FileMode.Create truncates it; OpenWrite would leave the tail of a longer
                // previous page behind.
                using (FileStream stream = fileInfo.Open(FileMode.Create, FileAccess.Write))
                using (StreamWriter writer = new StreamWriter(stream, Encoding.UTF8))
                {
                    writer.WriteLine("url:\t" + resultItems.Request.Url);
                    writer.WriteLine("html:\t" + resultItems.GetResultItem("html"));
                }
            }
            catch (IOException e)
            {
                // Write failures are only logged here; they are not propagated to the caller.
                LogUtils.GetLogger(spider).Warn("write file error", e);
            }
        }
        /// <summary>
        /// Serializes <c>resultItems.Results</c> to JSON and writes it to
        /// "&lt;BasePath&gt;/&lt;spider identity&gt;/&lt;md5 of request url&gt;.json".
        /// </summary>
        /// <param name="resultItems">Results to store; its request URL determines the file name.</param>
        /// <param name="spider">Spider supplying the identity and the logger.</param>
        /// <exception cref="IOException">Logged as a warning and rethrown when the file cannot be written.</exception>
        public void Process(ResultItems resultItems, ISpider spider)
        {
            string path = $"{BasePath}{PathSeperator}{ spider.Identity}{PathSeperator}{Encrypt.Md5Encrypt(resultItems.Request.Url.ToString())}.json";

            try
            {
                FileInfo file = PrepareFile(path);
                // The file name is derived from the URL, so the same file can be written again.
                // FileMode.Create truncates it; OpenWrite would leave bytes of a longer previous
                // payload behind and corrupt the JSON.
                using (FileStream stream = file.Open(FileMode.Create, FileAccess.Write))
                using (StreamWriter printWriter = new StreamWriter(stream, Encoding.UTF8))
                {
                    printWriter.WriteLine(JsonConvert.SerializeObject(resultItems.Results));
                }
            }
            catch (IOException e)
            {
                LogUtils.GetLogger(spider).Warn("write file error", e);
                throw;
            }
        }
        /// <summary>
        /// Triggers an ADSL redial when the message of the page's exception contains
        /// <c>_exceptionMessage</c>, then queues the request through <c>Spider.AddToCycleRetry</c>
        /// and replaces the page's exception.
        /// </summary>
        /// <param name="page">Page data; ignored when null, without non-blank content or without an exception.</param>
        /// <param name="spider">Spider used for logging, exiting and its site.</param>
        public override void Handle(ref Page page, ISpider spider)
        {
            // IsNullOrWhiteSpace already covers the null and empty cases.
            if (page == null || string.IsNullOrWhiteSpace(page.Content) || page.Exception == null)
            {
                return;
            }

            if (!page.Exception.Message.Contains(_exceptionMessage))
            {
                return;
            }

            var redial = NetworkCenter.Current.Executor.Redial();
            if (redial == RedialResult.Failed)
            {
                Logger.AllLog(spider.Identity, "Exit program because redial failed.", LogLevel.Error);
                spider.Exit();
            }

            // The value returned by AddToCycleRetry is not used; the page instance stays the same.
            Spider.AddToCycleRetry(page.Request, spider.Site);
            page.Exception = new DownloadException("Download failed and redial finished already.");
        }
Beispiel #43
0
 /// <summary>
 /// Prints every key/value pair of <c>item.Data</c> to the console between two separator lines,
 /// then delegates to the base implementation.
 /// </summary>
 /// <param name="item">Item whose data is printed.</param>
 /// <param name="spider">Spider whose name selects the output branch.</param>
 /// <returns>The task returned by the base implementation.</returns>
 public override Task Process(Item item, ISpider spider)
 {
     const string separator = "-----------------------";

     // Every listed name currently shares the default output.
     switch (spider.Name)
     {
     case "Simple":
     case "Movie":
     case "TMaill":
     default:
         Console.WriteLine(separator);
         foreach (var pair in item.Data)
         {
             Console.WriteLine($"{pair.Key}:{pair.Value}");
         }
         Console.WriteLine(separator);
         break;
     }

     return base.Process(item, spider);
 }
Beispiel #44
0
        /// <summary>
        /// Lazily yields one <c>Estate</c> per listing matched in <c>content</c>: the name and price
        /// come from the match, the area from <c>GetArea</c> called with the listing's link.
        /// </summary>
        /// <param name="spider">Spider passed on to <c>GetArea</c>.</param>
        /// <returns>The estates found, produced one by one while enumerating.</returns>
        public IEnumerable<Estate> Acquire(ISpider spider)
        {
            // Sample of the markup the pattern targets:
            //<strong class="font14"><span><a href="/c-xisilaigongguan8830/" target="_blank">西斯莱公馆</a>
            //<p><span>45973</span>元/平米</p>
            var listingPattern = new Regex("<strong\\s*class=\"font14\".*?<a\\s*href=\"(?<link>.*?)\".*?>(?<name>.*?)<\\/a>(.|\n)*?<p><span>(?<price>\\d+)</span>元/平米", RegexOptions.Compiled);

            foreach (Match listing in listingPattern.Matches(content))
            {
                var groups = listing.Groups;

                // The captured link is relative to the site's home link.
                var estateLink = SiteRoots.HomeLink + groups["link"].Value;
                var estateName = groups["name"].Value;
                var rawPrice   = groups["price"].Value;

                yield return new Estate
                {
                    Name  = estateName,
                    Price = TryParseDouble(rawPrice),
                    Area  = GetArea(spider, estateLink)
                };
            }
        }
Beispiel #45
0
        /// <summary>
        /// Serves a page from the file cache when a cache file for the request URL exists and its
        /// first line matches that URL; otherwise falls back to <c>DownloadWhenMiss</c>.
        /// </summary>
        /// <param name="request">Request to serve.</param>
        /// <param name="spider">Spider supplying the identity, the site and the logger.</param>
        /// <returns>The cached page on a hit, otherwise the result of <c>DownloadWhenMiss</c>.</returns>
        public Page Download(Request request, ISpider spider)
        {
            string path = BasePath + "/" + spider.Identity + "/";
            try
            {
                FileInfo file = PrepareFile(path + Encrypt.Md5Encrypt(request.Url.ToString()));

                // Dispose the reader so the file handle is released on every path.
                using (StreamReader bufferedReader = new StreamReader(file.OpenRead()))
                {
                    string line = bufferedReader.ReadLine();
                    if (("url:\t" + request.Url).Equals(line))
                    {
                        string html = GetHtml(bufferedReader);
                        Page page = new Page(request, spider.Site.ContentType);
                        page.Url = request.Url.ToString();
                        page.Content = html;

                        // Cache hit: return it instead of downloading again.
                        return page;
                    }
                }
            }
            catch (IOException e)
            {
                // A missing cache file is the normal miss case; anything else is worth a warning.
                if (e is FileNotFoundException)
                {
                    spider.Logger.Info("File not exist for url: " + request.Url);
                }
                else
                {
                    spider.Logger.Warn("File read error for url " + request.Url, e);
                }
            }

            return DownloadWhenMiss(request, spider);
        }
Beispiel #46
0
        /// <summary>
        /// Writes the request URL and every result entry to
        /// "&lt;BasePath&gt;/&lt;spider identity&gt;/&lt;md5 of request url&gt;.fd". List values are written as a
        /// "key:" line followed by one line per element; other values as "key:&lt;TAB&gt;value".
        /// </summary>
        /// <param name="resultItems">Results to store; its request URL determines the file name.</param>
        /// <param name="spider">Spider supplying the identity and the logger.</param>
        public void Process(ResultItems resultItems, ISpider spider)
        {
            string filePath = $"{BasePath}{PathSeperator}{spider.Identity}{PathSeperator}{Encrypt.Md5Encrypt(resultItems.Request.Url.ToString())}.fd";
            try
            {
                FileInfo file = PrepareFile(filePath);
                // The file name is derived from the URL, so the same file can be written again.
                // FileMode.Create truncates it; OpenWrite would leave the tail of a longer
                // previous result behind.
                using (FileStream stream = file.Open(FileMode.Create, FileAccess.Write))
                using (StreamWriter printWriter = new StreamWriter(stream, Encoding.UTF8))
                {
                    printWriter.WriteLine("url:\t" + resultItems.Request.Url);

                    foreach (var entry in resultItems.Results)
                    {
                        var list = entry.Value as IList;
                        if (list != null)
                        {
                            printWriter.WriteLine(entry.Key + ":");
                            foreach (var o in list)
                            {
                                printWriter.WriteLine(o);
                            }
                        }
                        else
                        {
                            printWriter.WriteLine(entry.Key + ":\t" + entry.Value);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                spider.Logger.Warn("Write file error.", e);
                throw;
            }
        }
Beispiel #47
0
 /// <summary>
 /// Prints every entry of <c>resultItems.Results</c> to the console as "key:value".
 /// </summary>
 /// <param name="resultItems">Results to print.</param>
 /// <param name="spider">Not used by this method.</param>
 public void Process(ResultItems resultItems, ISpider spider)
 {
     foreach (var pair in resultItems.Results)
     {
         string line = string.Format("{0}:{1}", pair.Key, pair.Value);
         Console.WriteLine(line);
     }
 }
        /// <summary>
        /// Downloads <paramref name="request"/> through the shared <c>httpClient</c> and converts the
        /// response into a <c>Page</c>.
        /// </summary>
        /// <param name="request">Request to download; the chosen proxy and the status code are stored in its extras.</param>
        /// <param name="spider">Spider whose <c>Site</c> supplies the proxy pool, accepted status codes and content type.</param>
        /// <returns>The validated page, or <c>null</c> when <c>spider.Site</c> is <c>null</c>.</returns>
        /// <remarks>
        /// A <c>RedialException</c> is rethrown untouched. For any other exception a page carrying that
        /// exception is passed to <c>ValidatePage</c> and the original exception is then rethrown.
        /// The response is disposed in <c>finally</c> on every path.
        /// </remarks>
        public override Page Download(Request request, ISpider spider)
        {
            if (spider.Site == null)
            {
                return null;
            }

            Site site = spider.Site;

            // NOTE(review): this local is never read below; the accept check uses site.AcceptStatCode directly.
            var acceptStatCodes = site.AcceptStatCode;

            //Logger.InfoFormat("Downloading page {0}", request.Url);

            HttpResponseMessage response = null;
            // Record the proxy taken from the site's pool on the request.
            var proxy = site.GetHttpProxyFromPool();
            request.PutExtra(Request.Proxy, proxy);
            int statusCode = 200;
            try
            {
                // Optional hook that is handed the site and the request before the HTTP message is built.
                if (GeneratePostBody != null)
                {
                    SingleExecutor.Execute(() =>
                    {
                        GeneratePostBody(spider.Site, request);
                    });
                }

                var httpMessage = GenerateHttpRequestMessage(request, site);

                response = RedialManagerUtils.Execute("downloader-download", (m) =>
                {
                    var message = (HttpRequestMessage)m;
                    // Blocks the calling thread until the asynchronous send completes.
                    return httpClient.SendAsync(message).Result;
                }, httpMessage);

                AddRequestCount();

                // NOTE(review): assuming System.Net.Http semantics, this throws for any non-success
                // status before the site-specific accept check below runs — confirm that is intended.
                response.EnsureSuccessStatusCode();
                if (!site.AcceptStatCode.Contains(response.StatusCode))
                {
                    throw new DownloadException($"下载 {request.Url} 失败. Code: {response.StatusCode}");
                }
                statusCode = (int)response.StatusCode;
                request.PutExtra(Request.StatusCode, statusCode);

                Page page = HandleResponse(request, response, statusCode, site);

                // need update
                page.TargetUrl = request.Url.ToString();

                //page.SetRawText(File.ReadAllText(@"C:\Users\Lewis\Desktop\taobao.html"));

                // Whenever a login page is encountered here, then once redialing has succeeded an exception
                // is thrown in every case so that the Spider puts the request back into the Scheduler.
                // So seeing several "Warning Custom Validate Failed" messages when running multi-threaded is
                // nothing to worry about; consider using a custom Exception type to tell them apart.
                ValidatePage(page, spider);

                // Must be cleared when finished; storing this value in Redis would make a single task loop forever.
                //request.PutExtra(Request.CycleTriedTimes, null);

                //#if !NET_CORE
                //					httpWebRequest.ServicePoint.ConnectionLimit = int.MaxValue;
                //#endif

                return page;

                // The normal result has already been returned above; reaching this point necessarily means the download failed.
                //throw new SpiderExceptoin("Download failed.");
            }
            catch (RedialException)
            {
                throw;
            }
            catch (Exception e)
            {
                // Give the validators a chance to inspect the failure via a page that carries the exception.
                Page page = new Page(request, site.ContentType) { Exception = e };

                ValidatePage(page, spider);
                throw;
            }
            finally
            {
                // Close the response first, so that an exception in the earlier statements cannot leave it unclosed.
                try
                {
                    //ensure the connection is released back to pool
                    //check:
                    //EntityUtils.consume(httpResponse.getEntity());
                    response?.Dispose();
                }
                catch (Exception e)
                {
                    var logger = LogUtils.GetLogger(spider);
                    logger.Warn("Close response fail.", e);
                }
            }
        }
        /// <summary>
        /// Writes every record in <paramref name="datas"/> to the database, one command execution per
        /// record, using the insert or update statement selected by <c>Mode</c>.
        /// </summary>
        /// <param name="datas">Records to persist; a <c>null</c> or empty list is a no-op.</param>
        /// <param name="spider">Not used by this method.</param>
        /// <remarks>
        /// In <c>PipelineMode.Insert</c> one parameter is bound per entry of <c>Columns</c>; in
        /// <c>PipelineMode.Update</c> the entries of <c>UpdateColumns</c> are bound first, followed by
        /// those of <c>Primary</c>. Any other mode does nothing.
        /// </remarks>
        public void Process(List<JObject> datas, ISpider spider)
        {
            // Nothing to persist: do not open a connection for an empty batch.
            if (datas == null || datas.Count == 0)
            {
                return;
            }

            RedialManagerUtils.Execute("pipeline-", () =>
            {
                switch (Mode)
                {
                    case PipelineMode.Insert:
                        {
                            // Both the connection and the command are disposed, even if a statement fails.
                            using (var conn = CreateConnection())
                            using (var cmd = conn.CreateCommand())
                            {
                                cmd.CommandText = GetInsertSql();
                                cmd.CommandType = CommandType.Text;
                                conn.Open();

                                foreach (var data in datas)
                                {
                                    // The command is reused for every record, so drop the previous record's parameters.
                                    cmd.Parameters.Clear();

                                    List<DbParameter> parameters = new List<DbParameter>();
                                    foreach (var column in Columns)
                                    {
                                        parameters.Add(CreateColumnParameter(column.Name, Convert(column.DataType), data));
                                    }

                                    cmd.Parameters.AddRange(parameters.ToArray());
                                    cmd.ExecuteNonQuery();
                                }
                            }
                            break;
                        }
                    case PipelineMode.Update:
                        {
                            using (var conn = CreateConnection())
                            using (var cmd = conn.CreateCommand())
                            {
                                cmd.CommandText = GetUpdateSql();
                                cmd.CommandType = CommandType.Text;
                                conn.Open();

                                foreach (var data in datas)
                                {
                                    cmd.Parameters.Clear();

                                    List<DbParameter> parameters = new List<DbParameter>();
                                    foreach (var column in UpdateColumns)
                                    {
                                        parameters.Add(CreateColumnParameter(column.Name, Convert(column.DataType), data));
                                    }

                                    foreach (var column in Primary)
                                    {
                                        parameters.Add(CreateColumnParameter(column.Name, Convert(column.DataType), data));
                                    }

                                    cmd.Parameters.AddRange(parameters.ToArray());
                                    cmd.ExecuteNonQuery();
                                }
                            }
                            break;
                        }
                }
            });
        }

        /// <summary>
        /// Builds the "@name" parameter for one column, reading its value from the token of
        /// <paramref name="data"/> selected by the column name.
        /// </summary>
        private DbParameter CreateColumnParameter(string columnName, DbType dbType, JObject data)
        {
            var parameter = CreateDbParameter();
            parameter.ParameterName = $"@{columnName}";
            // A missing token must be sent as DBNull: a CLR null value is not a database NULL in ADO.NET.
            parameter.Value = (object)data.SelectToken(columnName)?.Value<string>() ?? System.DBNull.Value;
            parameter.DbType = dbType;
            return parameter;
        }
 /// <summary>
 /// Deletes the queue, set and item keys that belong to <paramref name="spider"/> from Redis.
 /// </summary>
 /// <param name="spider">Spider whose <c>Identity</c> is used to derive the three keys.</param>
 public void Clear(ISpider spider)
 {
     var identity = spider.Identity;

     Redis.KeyDelete(GetQueueKey(identity));
     Redis.KeyDelete(GetSetKey(identity));
     Redis.KeyDelete(GetItemKey(identity));
 }
Beispiel #51
0
 /// <summary>
 /// Returns an HTML page for <paramref name="request"/> whose content is the empty string;
 /// no network access is performed.
 /// </summary>
 /// <param name="request">Request the returned page is created for.</param>
 /// <param name="spider">Not used by this method.</param>
 public Page Download(Request request, ISpider spider)
 {
     return new Page(request, ContentType.Html) { Content = "" };
 }
        /// <summary>
        /// Returns the length of the Redis set stored under the set key of <paramref name="spider"/>.
        /// </summary>
        /// <param name="spider">Spider whose <c>Identity</c> selects the set key.</param>
        /// <returns>The set length, cast from <c>long</c> to <c>int</c>.</returns>
        public int GetTotalRequestsCount(ISpider spider)
        {
            return RedialManagerUtils.Execute(
                "rds-gettotalcount",
                () => (int)Redis.SetLength(GetSetKey(spider.Identity)));
        }
 /// <summary>
 /// Returns the length of the Redis list stored under the queue key of <paramref name="spider"/>.
 /// </summary>
 /// <param name="spider">Spider whose <c>Identity</c> selects the queue key.</param>
 /// <returns>The list length, cast from <c>long</c> to <c>int</c>.</returns>
 public int GetLeftRequestsCount(ISpider spider)
 {
     return RedialManagerUtils.Execute(
         "rds-getleftcount",
         () => (int)Redis.ListLength(GetQueueKey(spider.Identity)));
 }
        /// <summary>
        /// Runs the base initialisation, derives the Redis key names from the MD5 of the spider's
        /// identity, and adds the identity to the <c>TaskList</c> sorted set scored with the current timestamp.
        /// </summary>
        /// <param name="spider">Spider whose <c>Identity</c> is hashed and registered.</param>
        public override void Init(ISpider spider)
        {
            base.Init(spider);

            var identityHash = Encrypt.Md5Encrypt(spider.Identity);

            // The hash is appended to whatever these members currently hold.
            // NOTE(review): appending means a second Init call would add the hash again — confirm Init runs once.
            ItemKey = ItemKey + identityHash;
            SetKey = SetKey + identityHash;
            // NOTE(review): QueueKey is overwritten with the bare hash while its siblings are appended to —
            // confirm this difference is intentional.
            QueueKey = identityHash;
            ErrorCountKey = ErrorCountKey + identityHash;
            SuccessCountKey = SuccessCountKey + identityHash;
            IdentityMd5 = identityHash;

            RedialManagerUtils.Execute("rds-in", () =>
            {
                var score = (long)DateTimeUtils.GetCurrentTimeStamp();
                Db.SortedSetAdd(TaskList, spider.Identity, score);
            });
        }
 /// <summary>
 /// Creates a graber bound to the given spider.
 /// </summary>
 /// <param name="spider">Spider instance stored in the <c>spider</c> field.</param>
 public HudsonApiGraber(ISpider spider)
 {
     this.spider = spider;
 }
Beispiel #56
0
 /// <summary>
 /// Writes the request URL and the "html" result item of <paramref name="resultItems"/> to
 /// <c>{BasePath}{sep}{spider.Identity}{sep}{md5(url)}.html</c>.
 /// </summary>
 /// <param name="resultItems">Results whose request URL and "html" item are written.</param>
 /// <param name="spider">Supplies the identity used in the path and the logger used for warnings.</param>
 /// <remarks>
 /// An <see cref="IOException"/> is logged as a warning and not rethrown.
 /// </remarks>
 public void Process(ResultItems resultItems, ISpider spider)
 {
     string directory = BasePath + PathSeperator + spider.Identity + PathSeperator;
     try
     {
         string fileName = Encrypt.Md5Encrypt(resultItems.Request.Url.ToString()) + ".html";
         FileInfo fileInfo = PrepareFile(directory + fileName);

         // FileMode.Create truncates an existing file. OpenWrite() would keep the old bytes, so a
         // shorter page written over a longer one would leave stale content at the end of the file.
         using (var stream = fileInfo.Open(FileMode.Create, FileAccess.Write))
         using (StreamWriter writer = new StreamWriter(stream, Encoding.UTF8))
         {
             writer.WriteLine("url:\t" + resultItems.Request.Url);
             writer.WriteLine("html:\t" + resultItems.GetResultItem("html"));
         }
     }
     catch (IOException e)
     {
         spider.Logger.Warn("Write file error.", e);
     }
 }
Beispiel #57
0
 /// <summary>
 /// Stores <paramref name="spider"/> in the <c>Spider</c> member.
 /// </summary>
 /// <param name="spider">Spider to remember.</param>
 public void Init(ISpider spider)
 {
     this.Spider = spider;
 }
Beispiel #58
0
 /// <summary>
 /// Delegates the download to <c>_downloaderWhenFileMiss</c> when one is configured.
 /// </summary>
 /// <param name="request">Request to download.</param>
 /// <param name="spider">Spider passed through to the fallback downloader.</param>
 /// <returns>The fallback downloader's page, or <c>null</c> when no fallback downloader is set.</returns>
 private Page DownloadWhenMiss(Request request, ISpider spider)
 {
     return _downloaderWhenFileMiss?.Download(request, spider);
 }
 /// <summary>
 /// Stores <paramref name="spider"/> in the <c>Spider</c> member; overridable by derived types.
 /// </summary>
 /// <param name="spider">Spider to remember.</param>
 public virtual void Init(ISpider spider)
 {
     this.Spider = spider;
 }
Beispiel #60
0
 /// <summary>
 /// Returns the simple (unqualified) name of the runtime type of <paramref name="spider"/>.
 /// </summary>
 /// <param name="spider">Spider whose runtime type name is returned.</param>
 public string GetSpiderName(ISpider spider)
 {
     var runtimeType = spider.GetType();
     return runtimeType.Name;
 }