Example #1
        static void Main(string[] args)
        {
            Console.WriteLine("Starting...");
            List<Uri> targets   = Baseline.ToUriList(args);
            Baseline   baseline = new Baseline(targets, 1500);
            Beatline   beatline = new Beatline();

            Console.WriteLine("Baseline contain(s) " + baseline.Count.ToString() + " node(s)");
            ConsoleCrawlRecorder   consoleListener = new ConsoleCrawlRecorder();
            BeatCrawlRecorder      recorder        = new BeatCrawlRecorder(beatline);
            MultiplexCrawlRecorder listener        = new MultiplexCrawlRecorder(new ICrawlRecorder[] { consoleListener, recorder });
            Spider spider = new Spider(baseline, new Pruner(targets, listener, 10), listener);

            Console.WriteLine("Spider initialized.");

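            // Crawl repeatedly, reporting progress every 5 seconds, until a key is pressed.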
            do
            {
                spider.Crawl();
                Console.WriteLine("--> Baseline contain(s) " + baseline.Count.ToString() + " node(s)");
                Console.WriteLine("--> Beatline contain(s) " + beatline.Count.ToString() + " beat(s)");
                if (beatline.Count > 0)
                {
                    Console.WriteLine("--> Last beat contain(s) " + beatline[beatline.Count - 1].Count.ToString() + " node(s)");
                }
                Console.WriteLine("--> press a key to exit");
                System.Threading.Thread.Sleep(5000);
            } while (!Console.KeyAvailable);

            spider.Stop();
            Console.WriteLine("End.");
        }
Example #2
        private static CrawlResults RunCrawler(Season season)
        {
            Log.InfoFormat("Crawler started at {0} for season {1}", DateTime.Now.ToShortTimeString(), season.Name);

            Spider crawler = new Spider();
            CrawlResults results = crawler.Crawl(season);
            Log.InfoFormat("\n{0}", DumpResults(results));

            Log.InfoFormat("Crawler finished at {0}.", DateTime.Now.ToShortTimeString());

            return results;
        }
Example #3
        void Crawl()
        {
            this.graph = spider.Crawl();
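            // Marshal the UI updates back onto the WPF dispatcher thread.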
            App.Current.Dispatcher.Invoke((Action) delegate
            {
                if (this.graph != null)
                {
                    WriteLine("The web page " + this.graph.StartUrl + " has " + this.graph.GetEdgeCount() + " links.");
                    WriteLine("Total webpages (vertices/nodes): " + this.graph.GetVertexCount() + " & Links (edges): " + this.graph.GetEdgeCount());

                    webgraphView.DrawGraph(this.graph);
                    Graph1.LayoutAlgorithmType         = webgraphView.LayoutAlgorithmType;
                    Graph1.OverlapRemovalAlgorithmType = "FSA";
                    Graph1.HighlightAlgorithmType      = "Simple";
                    Graph1.Graph = webgraphView.Graph;
                }
                else
                {
                    WriteLine("Something went wrong. Graph is null!");
                }
            });
        }
Example #4
        static void Main(string[] args)
        {
            // Initialize log4net
            log4net.Config.XmlConfigurator.Configure(new FileInfo(Path.Combine(Application.StartupPath, "Config", "log4net.config")));
            FrmSettings frmSettings = new FrmSettings();

            if (frmSettings.ShowDialog() == DialogResult.OK)
            {
                var settings = frmSettings.Settings;
                var logger   = Log4netFactory.CreateLogger();
                //var unhandledLinks = WebPageDao.GetUnhandledLinks();

                Spider spider = new Spider(settings, logger, null);

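                // AddUrlEvent decides whether a discovered URL joins the crawl queue; returning false would skip it (cf. the commented-out duplicate check).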
                spider.AddUrlEvent += addUrlArgs =>
                {
                    //if (WebPageDao.IsIdExisted(MD5Helper.GetMD5HashCode(addUrlArgs.Url)))
                    //    return false;
                    //WebPageDao.SaveOrUpdateWebPage(addUrlArgs.Url, addUrlArgs.Depth);
                    Console.WriteLine(addUrlArgs.Title + " - " + addUrlArgs.Url);
                    return true;
                };

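                // DataReceivedEvent fires once a page's HTML has been downloaded.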
                spider.DataReceivedEvent += receivedArgs =>
                {
                    //WebPage webPage = ArticleParse.GetArticleWebPage(receivedArgs.Html);
                    //webPage.Id = MD5Helper.GetMD5HashCode(receivedArgs.Url);
                    //webPage.Url = receivedArgs.Url;
                    //webPage.Depth = receivedArgs.Depth;
                    //webPage.InsertDate = DateTime.Now;
                    //webPage.Status = 1;
                    //WebPageDao.SaveOrUpdateWebPage(webPage);
                    MessageBox.Show(Html2Article.GetArticle(receivedArgs.Html).Content);
                };

                spider.Crawl();
            }
        }
Example #5
        static void Main(string[] args)
        {
            Console.Write("Enter a url to crawl: ");
            String urlString = Console.ReadLine();

            if (!urlString.StartsWith("http://") && !urlString.StartsWith("https://"))
            {
                urlString = "http://" + urlString;
            }
            //Console.WriteLine(WebPageInfo.GetPageTitleFromURL(urlString));
            //Console.ReadLine();

            Uri url;

            if (!Uri.TryCreate(urlString, UriKind.Absolute, out url))
            {
                Console.WriteLine("Invalid URL!");
                goto EndProgram;
            }
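            // Configure the crawl: same-domain only, max depth 3, HTML/plain-text/XML/PDF content.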
            var spider = new Spider(new SpiderData()
            {
                StartUrl            = url,
                UrlType             = UriKind.RelativeOrAbsolute,
                OnlySameDomain      = true,
                MaxDepth            = 3,
                AllowedContentTypes = new string[] { MediaTypeNames.Text.Html, MediaTypeNames.Text.Plain, MediaTypeNames.Text.Xml, MediaTypeNames.Application.Pdf },
                IsDebugMode         = false
            });
            //spider.OnWebResponseEvent += (o, e) =>
            //{
            //    if (e.Response == null)
            //    {
            //        Console.WriteLine("0 bytes received from " + e.Url.AbsoluteUri);
            //    }
            //    else
            //    {
            //        Console.WriteLine(e.Response.ContentLength + " bytes received from " + e.Url.AbsoluteUri);
            //    }
            //};
            //spider.OnAddEdgeEvent += (o, e) =>
            //{
            //    Console.WriteLine("Added " + e.Edge.Target.Url.AbsoluteUri);
            //};
            Graph graph = spider.Crawl();

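            // Feed the crawled link graph into the PageRanker.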
            Instance.PageRank = new PageRanker(graph);
            if (Instance.PageRank.Graph.Edges.Count() == 0)
            {
                Console.WriteLine("No Graph data!");
            }
            else
            {
                IList<IEdge<WebPageInfo>> edges = Instance.PageRank.Graph.Edges.ToList();
                StringBuilder sitemapData       = new StringBuilder();
                sitemapData.AppendLine(url + " - " + edges.Count + " links.");
                for (int i = 0; i < edges.Count; i++)
                {
                    var edge = edges[i];
                    sitemapData.AppendLine((" " + (i + 1)).PadLeft(3) + ": " + edge.Target.Url.AbsoluteUri);
                }
                sitemapData.AppendLine("");
                foreach (Edge <WebPageInfo> edge in edges)
                {
                    //sitemapData.AppendLine("Source:");
                    //sitemapData.AppendLine(" > URL:\t\t" + edge.Source.Url.AbsoluteUri);
                    //sitemapData.AppendLine(" > Title:\t" + edge.Source.PageTitle);
                    //sitemapData.AppendLine("Target:");
                    //sitemapData.AppendLine(" > URL:\t\t" + edge.Target.Url.AbsoluteUri);
                    //sitemapData.AppendLine(" > Title:\t" + edge.Target.PageTitle);
                    //sitemapData.AppendLine(" > Outgoing links:" + edge.Target.OutgoingLinks.Count);
                    sitemapData.AppendLine(edge.Target.Url.AbsoluteUri + " - " + edge.Target.OutgoingLinks.Count + " links.");

                    for (int i = 0; i < edge.Target.OutgoingLinks.Count; i++)
                    {
                        var item = edge.Target.OutgoingLinks[i];
                        sitemapData.AppendLine((" " + (i + 1)).PadLeft(3) + ": " + item.AbsoluteUri);
                    }
                    sitemapData.AppendLine("");
                }
                Instance.PageRank.Calculate(); //Calculate PageRank
                var pageRankData = new StringBuilder();
                foreach (WebPageInfo pr in Instance.PageRank.PageRankList)
                {
                    pageRankData.AppendLine(pr.Url.AbsoluteUri + ". PageRank: " + pr.Score);
                    //PageRanker.TotalScore += pr.Score;
                }
                pageRankData.AppendLine("PageRank Total: " + Instance.PageRank.TotalScore);

                Console.WriteLine(sitemapData.ToString());
                Console.WriteLine(pageRankData.ToString());
                File.WriteAllText(Path.Combine(Environment.CurrentDirectory, "sitemap_" + DateTime.Now.Ticks + ".txt"), sitemapData.ToString());
                File.WriteAllText(Path.Combine(Environment.CurrentDirectory, "pagerank_" + DateTime.Now.Ticks + ".txt"), pageRankData.ToString());
            }
EndProgram:
            Console.WriteLine("Press <any> key to exit...");
            Console.ReadKey();
        }
Example #6
    private void StartSpider(string domain, string keyword)
    {
        if (spider != null)
        {
            spider.Stop();
        }

        var settings = new Settings();

        settings.InitSeeds  = domain;
        settings.LockHost   = true;
        settings.KeepCookie = true;
        settings.Threads    = 5;     // number of crawl threads
        settings.UserAgent  = "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36";
        settings.Timeout    = 1000;
        settings.CrawlDepth = -1;    // crawl depth; -1 traverses the entire site
        settings.LimitSpeed = false; // throttle download speed automatically; false crawls at full speed

        Html2Article.AppendMode = true;
        Html2Article.LimitCount = 50;
        Html2Article.Depth      = 10;

        spider = new Spider(settings, null, null);
        spider.AddUrlEvent += addUrlArgs =>
        {
            return true; // accept every discovered URL
        };

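        // Scan each downloaded page: keep articles matching the keyword and collect any new image links as HTML tiles.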
        spider.DataReceivedEvent += receivedArgs =>
        {
            //System.Diagnostics.Debug.WriteLine(receivedArgs.Url);
            if (URLStackOperate(POPALL, null).IndexOf(receivedArgs.Url) == -1) // this URL has not been seen before
            {
                Article article = Html2Article.GetArticle(receivedArgs.Html);
                if (article.Content.IndexOf(keyword) != -1)                                      // the article contains the user-supplied keyword
                {
                    if (article.ContentWithTags.IndexOf("img") != -1)                            // the article contains images
                    {
                        MatchCollection matches = ImgLinkRegex.Matches(article.ContentWithTags); // extract all img links
                        foreach (Match match in matches)
                        {
                            string img = match.Groups["imgUrl"].Value;                             // the img link itself
                            if (ImagesHtmlCodeStackOperate(POPALL, null, null).IndexOf(img) == -1) // a new image
                            {
                                string htmlCode = "<div class='grid'>"
                                                  + "<div class='imgholder'>"
                                                  + "<a href='" + receivedArgs.Url + "' target='_blank'>";
                                if (match.Groups["imgUrl"].Value.IndexOf("http") == 0)
                                {
                                    htmlCode += "<img src='" + match.Groups["imgUrl"].Value + "'>";
                                }
                                else
                                {
                                    htmlCode += "<img src='" + domain + match.Groups["imgUrl"].Value + "'>";
                                }
                                htmlCode += "</a>"
                                            + "</div>"
                                            //+ "<strong>" + article.Title.Substring(0, 6) + "...</strong>"
                                            //+ "<p>" + article.Content.Substring(0, 20) + "...</p>"
                                            //+ "<div class='meta'>" + domain + "</div>"
                                            + "</div>";
                                ImagesHtmlCodeStackOperate(PUSH, htmlCode, img);
                            }
                        }
                    }
                }
            }
            URLStackOperate(PUSH, receivedArgs.Url);
        };
        spider.Crawl();
    }