示例#1
0
        /// <summary>
        /// Loads the HTML document at <paramref name="url"/> and renders the view with the document's
        /// markup together with the display names declared on the <see cref="ScrapingXpath"/> properties.
        /// </summary>
        /// <param name="url">
        /// Address of the page to load. When it is null or blank no page is fetched and the model's
        /// <c>HtmlDoc</c> is left null.
        /// </param>
        /// <returns>The view bound to the populated <see cref="CrawlerViewModel"/>.</returns>
        public IActionResult About(string url)
        {
            // Properties without a [Display] attribute are skipped: calling First() on their empty
            // attribute sequence would throw InvalidOperationException.
            var list = typeof(ScrapingXpath).GetProperties()
                .Select(property => property.GetCustomAttribute<DisplayAttribute>())
                .Where(display => display != null)
                .Select(display => display.Name)
                .ToList();

            string html = null;
            if (!string.IsNullOrWhiteSpace(url))
            {
                // NOTE(review): url is passed to HtmlWeb.Load unchanged; if this action is reachable by
                // untrusted callers, restrict the allowed hosts/schemes before fetching.
                var doc = new HtmlWeb().Load(url);
                html = doc.DocumentNode.InnerHtml;
            }

            var model = new CrawlerViewModel {
                KeyValuePairs = new List<KeyValuePair<string, string>>(),
                HtmlDoc       = html,
                MapTags       = list,
                Url           = url
            };

            return this.View(model);
        }
示例#2
0
        /// <summary>
        /// Renders the view with a <see cref="CrawlerViewModel"/> whose <c>MapTags</c> holds the display
        /// names declared on the <see cref="ScrapingXpath"/> properties.
        /// </summary>
        /// <returns>The view bound to the populated model.</returns>
        public IActionResult About()
        {
            // Properties without a [Display] attribute are skipped: calling First() on their empty
            // attribute sequence would throw InvalidOperationException.
            var list = typeof(ScrapingXpath).GetProperties()
                .Select(property => property.GetCustomAttribute<DisplayAttribute>())
                .Where(display => display != null)
                .Select(display => display.Name)
                .ToList();

            var model = new CrawlerViewModel {
                MapTags = list
            };

            // Pass the populated model; handing the view a fresh empty instance would drop MapTags.
            return this.View(model);
        }
示例#3
0
 /// <summary>
 /// Runs the crawler against the supplied view model and renders the view with that same model.
 /// </summary>
 /// <param name="viewModel">Model handed to <c>crawler.Crawl</c> and then to the view.</param>
 /// <returns>The view bound to <paramref name="viewModel"/>.</returns>
 public ActionResult Default(CrawlerViewModel viewModel)
 {
     try
     {
         crawler.Crawl(viewModel);
     }
     catch (Exception crawlFailure)
     {
         // Any failure is reported through the model instead of propagating to the caller.
         viewModel.ErrorMsg = crawlFailure.Message;
     }

     return View(viewModel);
 }
示例#4
0
        /// <summary>
        /// Crawls the site at <c>viewModel.UrlToCrawl</c> and appends the links gathered in
        /// <c>allLinksOnPage</c> to <c>viewModel.CrawledLinks</c>.
        /// </summary>
        /// <param name="viewModel">Carries the URL to crawl and receives the links and status messages.</param>
        /// <returns>
        /// The same <paramref name="viewModel"/> instance. <c>ErrorMsg</c> is set when the URL is rejected
        /// or the crawl reports an error; <c>SuccessMsg</c> is set only when the crawl reports no error.
        /// </returns>
        public CrawlerViewModel Crawl(CrawlerViewModel viewModel)
        {
            if (!Helper.IsValidUrl(viewModel.UrlToCrawl))
            {
                // NOTE(review): the wording mentions a mail address although a URL is being validated — confirm.
                viewModel.ErrorMsg = " Please enter mail adress";
                return viewModel;
            }

            allLinksOnPage = new List<Uri>();
            CrawlConfiguration config = new CrawlerNetConfig().Initalize();

            this.crawler = new PoliteWebCrawler(config);
            crawler.PageCrawlCompleted += crawler_PageCrawlCompleted;

            CrawlResult result = crawler.Crawl(new Uri(viewModel.UrlToCrawl));

            if (result.ErrorOccurred)
            {
                viewModel.ErrorMsg = String.Format("Crawler completed with error: {0}", result.ErrorException.Message);
            }

            // A missing or malformed "IsProd" setting is treated as false instead of throwing.
            bool isProd;
            bool.TryParse(ConfigurationManager.AppSettings["IsProd"], out isProd);

            if (isProd)
            {
                viewModel.CrawledLinks.AddRange(allLinksOnPage);
            }
            else
            {
                // Outside production only the first 10 links are reported.
                viewModel.CrawledLinks.AddRange(allLinksOnPage.Take(10));
            }

            // Do not report success alongside an error message.
            if (!result.ErrorOccurred)
            {
                viewModel.SuccessMsg = " Successfully Listed !";
            }

            return viewModel;
        }
示例#5
0
 /// <summary>
 /// Creates the window, assigning a new <see cref="CrawlerViewModel"/> to <c>cvm</c> and using it as
 /// the data context before the components are initialized.
 /// </summary>
 public MainWindow()
 {
     cvm = new CrawlerViewModel();

     // The data context is assigned before InitializeComponent runs.
     DataContext = cvm;
     InitializeComponent();
 }
示例#6
0
        /// <summary>
        /// Downloads each remote image listed in <paramref name="source"/> and passes it to <c>Upload</c>,
        /// collecting one <see cref="CrawlerItemViewModel"/> per source entry.
        /// </summary>
        /// <param name="repository">Forwarded to <c>Upload</c>.</param>
        /// <param name="siteName">Forwarded to <c>Upload</c>.</param>
        /// <param name="folder">Forwarded to <c>Upload</c>.</param>
        /// <param name="source">URLs to fetch. Null or empty yields an error state and no items.</param>
        /// <param name="catcherPathFormat">Format handed to <c>PathFormatter.Format</c> to build the stored file name.</param>
        /// <returns>
        /// A <see cref="CrawlerViewModel"/> whose <c>state</c> is "SUCCESS" (or an error text when no source
        /// was given) and whose <c>list</c> holds the per-URL results. A failure on one URL is recorded in
        /// that item's <c>state</c> and does not stop the remaining URLs.
        /// </returns>
        public CrawlerViewModel Crawler(
            Repository repository,
            string siteName,
            MediaFolder folder,
            string[] source,
            string catcherPathFormat)
        {
            var result = new CrawlerViewModel
            {
                state = "SUCCESS"
            };

            if (source == null || source.Length == 0)
            {
                result.state = "参数错误:没有指定抓取源";
                return result;
            }

            result.list = source.Select(item =>
            {
                var crawler = new CrawlerItemViewModel();
                if (!IsExternalIPAddress(item))
                {
                    crawler.state = "INVALID_URL";
                    return crawler;
                }

                crawler.source = item;
                try
                {
                    // Request creation and GetResponse are inside the try so that a non-HTTP scheme or a
                    // non-success status (which raises WebException) is reported on this item only.
                    var request = (HttpWebRequest)WebRequest.Create(item);
                    using (var response = (HttpWebResponse)request.GetResponse())
                    {
                        if (response.StatusCode != HttpStatusCode.OK)
                        {
                            crawler.state = "Url returns " + response.StatusCode + ", " + response.StatusDescription;
                            return crawler;
                        }

                        if (response.ContentType.IndexOf("image", StringComparison.OrdinalIgnoreCase) < 0)
                        {
                            crawler.state = "Url is not an image";
                            return crawler;
                        }

                        var name = Path.GetFileName(item);
                        if (!Path.HasExtension(item))
                        {
                            // Sources without an extension are stored as .jpg.
                            name += ".jpg";
                        }

                        var fileName = PathFormatter.Format(name, catcherPathFormat);
                        using (var stream = response.GetResponseStream())
                        using (var ms = new MemoryStream())
                        {
                            stream.CopyTo(ms);

                            // NOTE(review): ms is positioned at its end when handed to Upload;
                            // presumably Upload rewinds or reads the buffer directly — confirm.
                            var res     = Upload(repository, siteName, folder, fileName, ms);
                            crawler.url = res.url;
                        }

                        crawler.state = "SUCCESS";
                    }
                }
                catch (Exception e)
                {
                    crawler.state = "抓取错误:" + e.Message;
                }

                return crawler;
            }).ToList(); // Materialize once so enumerating the list again does not repeat the downloads/uploads.

            return result;
        }