/// <summary>
/// Console entry point: builds a baseline from the command-line targets, wires a console
/// recorder and a beat recorder behind one multiplexing recorder, and then crawls in a
/// loop, printing baseline/beatline statistics after every pass, until a key press is
/// detected.
/// </summary>
/// <param name="args">Command-line arguments, converted to target URIs by <c>Baseline.ToUriList</c>.</param>
static void Main(string[] args)
{
    Console.WriteLine("Starting...");

    List<Uri> targets = Baseline.ToUriList(args);
    Baseline baseline = new Baseline(targets, 1500);
    Beatline beatline = new Beatline();
    Console.WriteLine("Baseline contain(s) {0} node(s)", baseline.Count);

    // Every crawl event goes both to the console and into the beatline.
    ConsoleCrawlRecorder consoleRecorder = new ConsoleCrawlRecorder();
    BeatCrawlRecorder beatRecorder = new BeatCrawlRecorder(beatline);
    ICrawlRecorder[] sinks = new ICrawlRecorder[] { consoleRecorder, beatRecorder };
    MultiplexCrawlRecorder fanOut = new MultiplexCrawlRecorder(sinks);

    Pruner pruner = new Pruner(targets, fanOut, 10);
    Spider spider = new Spider(baseline, pruner, fanOut);
    Console.WriteLine("Spider initialized.");

    while (true)
    {
        spider.Crawl();

        Console.WriteLine("--> Baseline contain(s) {0} node(s)", baseline.Count);
        Console.WriteLine("--> Beatline contain(s) {0} beat(s)", beatline.Count);
        if (beatline.Count > 0)
        {
            Console.WriteLine("--> Last beat contain(s) {0} node(s)", beatline[beatline.Count - 1].Count);
        }

        Console.WriteLine("--> press a key to exit");
        System.Threading.Thread.Sleep(5000);

        // The key check happens only after the pause, so at least one pass always runs.
        if (Console.KeyAvailable)
        {
            break;
        }
    }

    spider.Stop();
    Console.WriteLine("End.");
}
/// <summary>
/// Crawls the given season with a fresh <c>Spider</c>, logging the start time, the dumped
/// results and the finish time.
/// </summary>
/// <param name="season">Season to crawl; its <c>Name</c> appears in the start log entry.</param>
/// <returns>The results returned by the spider.</returns>
private static CrawlResults RunCrawler(Season season)
{
    string startedAt = DateTime.Now.ToShortTimeString();
    Log.InfoFormat("Crawler started at {0} for season {1}", startedAt, season.Name);

    CrawlResults outcome = new Spider().Crawl(season);

    // The leading newline keeps the dump on its own line in the log output.
    Log.InfoFormat("\n{0}", DumpResults(outcome));

    string finishedAt = DateTime.Now.ToShortTimeString();
    Log.InfoFormat("Crawler finished at {0}.", finishedAt);

    return outcome;
}
/// <summary>
/// Runs one crawl for <paramref name="season"/> and writes three log entries around it:
/// start (time and season name), the dumped results, and finish (time).
/// </summary>
/// <param name="season">The season handed to <c>Spider.Crawl</c>.</param>
/// <returns>Whatever <c>Spider.Crawl</c> produced for the season.</returns>
private static CrawlResults RunCrawler(Season season)
{
    Log.InfoFormat(
        "Crawler started at {0} for season {1}",
        DateTime.Now.ToShortTimeString(),
        season.Name);

    Spider spider = new Spider();
    CrawlResults crawlResults = spider.Crawl(season);

    Log.InfoFormat("\n{0}", DumpResults(crawlResults));
    Log.InfoFormat(
        "Crawler finished at {0}.",
        DateTime.Now.ToShortTimeString());

    return crawlResults;
}
/// <summary>
/// Runs the spider, stores the resulting graph in <c>this.graph</c>, and then — through the
/// application dispatcher — either reports and draws the graph or prints an error when the
/// crawl produced no graph.
/// </summary>
void Crawl()
{
    this.graph = spider.Crawl();

    Action present = delegate
    {
        if (this.graph == null)
        {
            WriteLine("Something went wrong. Graph is null!");
            return;
        }

        WriteLine("The web page " + this.graph.StartUrl + " has " + this.graph.GetEdgeCount() + " links.");
        WriteLine("Total webpages (vertices/nodes): " + this.graph.GetVertexCount() + " & Links (edges): " + this.graph.GetEdgeCount());

        // Draw into the view first; Graph1 then takes over the view's layout type and graph.
        webgraphView.DrawGraph(this.graph);
        Graph1.LayoutAlgorithmType = webgraphView.LayoutAlgorithmType;
        Graph1.OverlapRemovalAlgorithmType = "FSA";
        Graph1.HighlightAlgorithmType = "Simple";
        Graph1.Graph = webgraphView.Graph;
    };

    App.Current.Dispatcher.Invoke(present);
}
/// <summary>
/// Application entry point: configures log4net, shows the settings dialog and, when the
/// dialog is confirmed with OK, runs a spider with the chosen settings. Each added URL is
/// echoed to the console and each received page is passed through
/// <c>Html2Article.GetArticle</c> and shown in a message box.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Initialize log4net from the "Config/log4net.config" file under the application start-up path.
    log4net.Config.XmlConfigurator.Configure(new FileInfo(Path.Combine(Application.StartupPath, "Config", "log4net.config")));

    // A form shown with ShowDialog is only hidden when it closes, so it must be disposed
    // explicitly; the using block releases it on every exit path.
    using (FrmSettings frmSettings = new FrmSettings())
    {
        if (frmSettings.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        var settings = frmSettings.Settings;
        var logger = Log4netFactory.CreateLogger();

        // NOTE(review): the third constructor argument is passed as null; an earlier version
        // apparently supplied previously unhandled links here — confirm before re-enabling.
        Spider spider = new Spider(settings, logger, null);

        // Echo every URL the spider adds. The handler always returns true
        // (presumably "keep this URL" — confirm against Spider.AddUrlEvent).
        spider.AddUrlEvent += addUrlArgs =>
        {
            Console.WriteLine(addUrlArgs.Title + " - " + addUrlArgs.Url);
            return true;
        };

        // Show the article extracted from each received page; persistence of pages is
        // currently not performed here.
        spider.DataReceivedEvent += receivedArgs =>
        {
            MessageBox.Show(Html2Article.GetArticle(receivedArgs.Html));
        };

        spider.Crawl();
    }
}
/// <summary>
/// Console entry point: prompts for a URL, crawls it (same domain only, depth 3), prints a
/// sitemap and the PageRank scores, and writes both reports to text files in the current
/// directory.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.Write("Enter a url to crawl: ");

    // ReadLine returns null when standard input is closed, so fall back to an empty string
    // (which is then reported as an invalid URL instead of throwing).
    string urlString = (Console.ReadLine() ?? string.Empty).Trim();

    // Prepend a scheme only when none is present. Recognising "https://" as well prevents
    // such an address from being turned into "http://https://...".
    bool hasScheme = urlString.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                     || urlString.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        urlString = "http://" + urlString;
    }

    Uri url;
    if (Uri.TryCreate(urlString, UriKind.Absolute, out url))
    {
        CrawlAndReport(url);
    }
    else
    {
        Console.WriteLine("Invalid URL!");
    }

    Console.WriteLine("Press <any> key to exit...");
    Console.ReadLine();
}

/// <summary>
/// Crawls <paramref name="url"/>, stores a <c>PageRanker</c> for the resulting graph in
/// <c>Instance.PageRank</c>, and prints and saves the sitemap and PageRank reports.
/// </summary>
/// <param name="url">Absolute start URL of the crawl.</param>
private static void CrawlAndReport(Uri url)
{
    var spider = new Spider(new SpiderData()
    {
        StartUrl = url,
        UrlType = UriKind.RelativeOrAbsolute,
        OnlySameDomain = true,
        MaxDepth = 3,
        AllowedContentTypes = new string[]
        {
            MediaTypeNames.Text.Html,
            MediaTypeNames.Text.Plain,
            MediaTypeNames.Text.Xml,
            MediaTypeNames.Application.Pdf
        },
        IsDebugMode = false
    });

    Graph graph = spider.Crawl();
    Instance.PageRank = new PageRanker(graph);

    // Materialise the edges once; the list serves both the emptiness check and the report.
    IList<IEdge<WebPageInfo>> edges = Instance.PageRank.Graph.Edges.ToList();
    if (edges.Count == 0)
    {
        Console.WriteLine("No Graph data!");
        return;
    }

    // Sitemap, part 1: every edge target found in the graph.
    StringBuilder sitemapData = new StringBuilder();
    sitemapData.AppendLine(url + " - " + edges.Count + " links.");
    for (int i = 0; i < edges.Count; i++)
    {
        var edge = edges[i];
        sitemapData.AppendLine((" " + (i + 1)).PadLeft(3) + ": " + edge.Target.Url.AbsoluteUri);
    }
    sitemapData.AppendLine("");

    // Sitemap, part 2: for each edge target, its own outgoing links.
    foreach (Edge<WebPageInfo> edge in edges)
    {
        sitemapData.AppendLine(edge.Target.Url.AbsoluteUri + " - " + edge.Target.OutgoingLinks.Count + " links.");
        for (int i = 0; i < edge.Target.OutgoingLinks.Count; i++)
        {
            var item = edge.Target.OutgoingLinks[i];
            sitemapData.AppendLine((" " + (i + 1)).PadLeft(3) + ": " + item.AbsoluteUri);
        }
        sitemapData.AppendLine("");
    }

    Instance.PageRank.Calculate(); // Calculate PageRank

    var pageRankData = new StringBuilder();
    foreach (WebPageInfo pr in Instance.PageRank.PageRankList)
    {
        pageRankData.AppendLine(pr.Url.AbsoluteUri + ". PageRank: " + pr.Score);
    }
    pageRankData.AppendLine("PageRank Total: " + Instance.PageRank.TotalScore);

    Console.WriteLine(sitemapData.ToString());
    Console.WriteLine(pageRankData.ToString());

    // One shared timestamp so the two report files of a run carry the same suffix.
    long stamp = DateTime.Now.Ticks;
    File.WriteAllText(Path.Combine(Environment.CurrentDirectory, "sitemap_" + stamp + ".txt"), sitemapData.ToString());
    File.WriteAllText(Path.Combine(Environment.CurrentDirectory, "pagerank_" + stamp + ".txt"), pageRankData.ToString());
}
/// <summary>
/// Stops any spider that is already assigned, then starts a new crawl of the given seed. For
/// every received page whose URL is not yet on the URL stack, the article is extracted;
/// when it contains <paramref name="keyword"/> and an "img" substring, an HTML grid snippet
/// is pushed for each image link that has not been recorded before. The page URL is pushed
/// onto the URL stack after every received page.
/// </summary>
/// <param name="domian">Seed assigned to <c>Settings.InitSeeds</c>; also prefixed to image links that do not start with "http".</param>
/// <param name="keyword">Text that the article content must contain for its images to be collected.</param>
private void StartSpider(string domian, string keyword)
{
    if (spider != null)
    {
        spider.Stop();
    }

    var settings = new Settings
    {
        InitSeeds = domian,
        LockHost = true,
        KeepCookie = true,
        Threads = 5, // number of crawl threads
        UserAgent = "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36",
        Timeout = 1000,
        CrawlDepth = -1, // crawl depth; -1 means traverse the whole site
        LimitSpeed = false // smart bandwidth control; false downloads pages at full speed
    };

    Html2Article.AppendMode = true;
    Html2Article.LimitCount = 50;
    Html2Article.Depth = 10;

    spider = new Spider(settings, null, null);

    // Accept every URL offered by the spider.
    spider.AddUrlEvent += addUrlArgs => true;

    spider.DataReceivedEvent += receivedArgs =>
    {
        bool isNewUrl = URLStackOperate(POPALL, null).IndexOf(receivedArgs.Url) == -1;
        if (isNewUrl)
        {
            Article article = Html2Article.GetArticle(receivedArgs.Html);

            // Only articles that contain the user's keyword and an "img" substring are scanned for images.
            if (article.Content.IndexOf(keyword) != -1 && article.ContentWithTags.IndexOf("img") != -1)
            {
                // Collect every image link found in the tagged content.
                foreach (Match match in ImgLinkRegex.Matches(article.ContentWithTags))
                {
                    string img = match.Groups["imgUrl"].Value;

                    // Skip images that have already been recorded.
                    if (ImagesHtmlCodeStackOperate(POPALL, null, null).IndexOf(img) != -1)
                    {
                        continue;
                    }

                    // Links that do not start with "http" are prefixed with the seed.
                    string source = img.IndexOf("http") == 0 ? img : domian + img;

                    string HtmlCode = "<div class='grid'>"
                        + "<div class='imgholder'>"
                        + "<a href='" + receivedArgs.Url + "' target='_blank'>"
                        + "<img src='" + source + "'>"
                        + "</a>"
                        + "</div>"
                        + "</div>";

                    ImagesHtmlCodeStackOperate(PUSH, HtmlCode, img);
                }
            }
        }

        // Pushed for every received page, whether or not its URL was new.
        URLStackOperate(PUSH, receivedArgs.Url);
    };

    spider.Crawl();
}