예제 #1
0
        /// <summary>
        /// Runs when the browser control finishes loading a document: reads the loaded
        /// HTML, extracts the article with <c>Html2Article</c>, and shows the publish
        /// date, title, plain-text content and tagged content in the form's controls.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data carrying the URL of the document that completed.</param>
        private void webBrowser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            // Ignore completions whose URL differs from the main document's URL
            // (presumably sub-frame loads — confirm).
            if (e.Url != this.webBrowser.Document.Url)
            {
                return;
            }

            // Decode the raw document stream with the encoding the document reports.
            // The reader is disposed deterministically instead of being left to the finalizer.
            string encode = this.webBrowser.Document.Encoding;
            string html;
            using (StreamReader sr = new StreamReader(this.webBrowser.DocumentStream, Encoding.GetEncoding(encode)))
            {
                html = sr.ReadToEnd();
            }

            //Html2Article.LimitCount = 100;
            //Html2Article.Depth = 8;
            // Choose whether the extractor uses body "append mode", driven by the check box.
            Html2Article.AppendMode = this.appendCheckBox.CheckState == CheckState.Checked;

            Stopwatch sw = new Stopwatch();

            sw.Start();
            // Parse the HTML into structured Article data.
            Article article = Html2Article.GetArticle(html);

            sw.Stop();
            msgLabel.Text = "提取耗时:" + Environment.NewLine + sw.ElapsedMilliseconds + "毫秒";

            this.publishDateTextBox.Text = article.PublishDate.ToString();
            this.titleTextBox.Text       = article.Title;
            this.contentTextBox.Text     = article.Content;

            // Pass the tagged content through UrlUtility.FixUrl together with the URL typed by the user
            // (presumably to resolve relative links — confirm against UrlUtility).
            string articleHtml = UrlUtility.FixUrl(this.urlTextBox.Text, article.ContentWithTags);

            this.contentWebBrowser.DocumentText = articleHtml;

            ResetState();
        }
예제 #2
0
        /// <summary>
        /// Downloads the chapter page behind <paramref name="nVChapter"/>'s link, extracts
        /// its article text, and returns a block made of an empty line, the chapter title
        /// and the article content, each terminated by a line break.
        /// </summary>
        /// <param name="nVChapter">Chapter whose <c>Link</c> is fetched and whose <c>Title</c> is emitted.</param>
        /// <returns>The formatted chapter text.</returns>
        public string GetChapterContent(NVChapter nVChapter)
        {
            var     pageHtml = HttpHelper.GetString(nVChapter.Link, baseEncoding);
            Article parsed   = Html2Article.GetArticle(pageHtml);

            // Leading blank line, then title line, then content line.
            return new StringBuilder()
                .AppendLine()
                .Append(nVChapter.Title).AppendLine()
                .Append(parsed.Content).AppendLine()
                .ToString();
        }
예제 #3
0
        /// <summary>
        /// Fetches <paramref name="url"/> with an HTTP GET, extracts the article from the
        /// returned HTML and serializes its content and title to JSON.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>A JSON object string with the properties <c>content</c> and <c>title</c>.</returns>
        public string getUrl(string url)
        {
            HttpHelper http   = new HttpHelper();
            var        result = http.GetHtml(new HttpItem()
            {
                URL    = url,
                Method = "GET"
            });

            // The page was previously also parsed with NSoup, but that result was never
            // used; the redundant parse has been removed.
            var article = Html2Article.GetArticle(result.Html);

            return JsonConvert.SerializeObject(new { content = article.Content, title = article.Title });
        }
예제 #4
0
        /// <summary>
        /// Downloads <c>RequestUri</c>, extracts the article from the page, adds it to the
        /// index and, while <paramref name="depth"/> is below <c>wm.Depth</c>, registers the
        /// page's links in <c>wm.unfinisheduri</c> with depth + 1.
        /// </summary>
        /// <remarks>
        /// The method is fire-and-forget (<c>async void</c>), so failures cannot reach the
        /// caller; they are written to the trace listeners instead of being silently dropped.
        /// </remarks>
        /// <param name="wm">Work manager holding the page counter, the depth limit and the pending-URI map.</param>
        /// <param name="depth">Depth of the page being requested.</param>
        /// <param name="indexmanager">Index that receives the extracted title and content.</param>
        public async void GenerateWebRequestAsync(WorkManage wm, int depth, IndexManager indexmanager)
        {
            // Locals track exactly what this call created, so cleanup never touches
            // a request/response left in the fields by an earlier call.
            HttpWebRequest  request  = null;
            HttpWebResponse response = null;

            try
            {
                request           = (HttpWebRequest)WebRequest.Create(RequestUri); // create the request
                webRequest        = request;
                request.Method    = "GET";
                request.KeepAlive = true;                                          // persistent connection
                // NOTE(review): the HttpWebRequest documentation states that Timeout does not apply
                // to asynchronous requests, so this 100 ms value likely has no effect here — confirm.
                request.Timeout   = 100;

                response    = (HttpWebResponse)await request.GetResponseAsync();   // response for this request
                webResponse = response;

                ContentStream = response.GetResponseStream();                      // response byte stream
                string   html     = GetContent();                                  // convert to HTML text
                GetLinks getLinks = new GetLinks(html);

                Html2Article.AppendMode = false;
                Html2Article.Depth      = 80;
                Article article = Html2Article.GetArticle(html);

                // NOTE(review): this increment is not atomic; if several requests run
                // concurrently the counter may lose updates — confirm how wm.sum is used.
                wm.sum++;
                indexmanager.AddIndex(article.Title, article.Content, RequestUri);

                if (depth < wm.Depth)
                {
                    foreach (string uri in getLinks.GetUris())
                    {
                        // Check and insert under the same lock; checking outside it allowed two
                        // callers to pass the test and the second Add to fail on a duplicate key.
                        lock (wm.unfinisheduri)
                        {
                            if (!wm.unfinisheduri.ContainsKey(uri))
                            {
                                wm.unfinisheduri.Add(uri, depth + 1);
                            }
                        }
                    }
                }
            }
            catch (System.Exception ex)
            {
                // Best effort: a failed page must not bring the crawler down, but leave a trace.
                System.Diagnostics.Trace.WriteLine("GenerateWebRequestAsync failed: " + ex);
            }
            finally
            {
                // Release the connection on failure as well as on success.
                if (request != null)
                {
                    request.Abort();
                }
                if (response != null)
                {
                    response.Close();
                }
            }
            //Thread.Sleep(timeSpan);
            //ThreadPool
        }
예제 #5
0
        /// <summary>
        /// Entry point: configures log4net, shows the settings dialog and, when the dialog
        /// is confirmed, starts a crawl that prints every added URL to the console and
        /// shows each received page's extracted article in a message box.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Initialize log4net from Config/log4net.config under the start-up directory.
            string   logConfigPath = Path.Combine(Application.StartupPath, "Config", "log4net.config");
            FileInfo logConfigFile = new FileInfo(logConfigPath);
            log4net.Config.XmlConfigurator.Configure(logConfigFile);

            FrmSettings settingsDialog = new FrmSettings();

            // Nothing to do unless the dialog was confirmed.
            if (settingsDialog.ShowDialog() != DialogResult.OK)
            {
                return;
            }

            var crawlSettings = settingsDialog.Settings;
            var log           = Log4netFactory.CreateLogger();

            // Previously crawled links (WebPageDao.GetUnhandledLinks) are not loaded; null is passed instead.
            var crawler = new Spider(crawlSettings, log, null);

            // Accept every discovered URL and echo it to the console.
            // The WebPageDao de-duplication / persistence step is currently disabled.
            crawler.AddUrlEvent += addUrlArgs =>
            {
                Console.WriteLine(addUrlArgs.Title + " - " + addUrlArgs.Url);
                return true;
            };

            // Show the extracted article for every downloaded page.
            // Saving the page through ArticleParse / WebPageDao is currently disabled.
            crawler.DataReceivedEvent += receivedArgs =>
            {
                MessageBox.Show(Html2Article.GetArticle(receivedArgs.Html));
            };

            crawler.Crawl();
        }
예제 #6
0
    /// <summary>
    /// Stops any existing spider, then crawls <paramref name="domian"/> and, for every
    /// newly seen page whose article text contains <paramref name="keyword"/>, pushes an
    /// HTML snippet for each not-yet-seen image found in the article.
    /// </summary>
    /// <param name="domian">Seed address; also prefixed to image URLs that do not start with "http".</param>
    /// <param name="keyword">Text the article content must contain.</param>
    private void StartSpider(string domian, string keyword)
    {
        if (spider != null)
        {
            spider.Stop();
        }

        var settings = new Settings
        {
            InitSeeds  = domian,
            LockHost   = true,
            KeepCookie = true,
            Threads    = 5,     // number of crawl threads
            UserAgent  = "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36",
            Timeout    = 1000,
            CrawlDepth = -1,    // crawl depth; -1 means traverse the whole site
            LimitSpeed = false  // smart bandwidth control; false downloads at full speed
        };

        Html2Article.AppendMode = true;
        Html2Article.LimitCount = 50;
        Html2Article.Depth      = 10;

        spider = new Spider(settings, null, null);

        // Every discovered URL is accepted.
        spider.AddUrlEvent += addUrlArgs => true;

        spider.DataReceivedEvent += receivedArgs =>
        {
            // Only pages whose URL has not been recorded yet are analysed.
            bool isNewUrl = URLStackOperate(POPALL, null).IndexOf(receivedArgs.Url) == -1;
            if (isNewUrl)
            {
                Article article = Html2Article.GetArticle(receivedArgs.Html);

                // The article must contain the keyword and at least one "img" occurrence.
                bool mentionsKeyword = article.Content.IndexOf(keyword) != -1;
                if (mentionsKeyword && article.ContentWithTags.IndexOf("img") != -1)
                {
                    // Walk every image link matched in the tagged content.
                    foreach (Match match in ImgLinkRegex.Matches(article.ContentWithTags))
                    {
                        string img = match.Groups["imgUrl"].Value;

                        // Skip images that were already pushed.
                        if (ImagesHtmlCodeStackOperate(POPALL, null, null).IndexOf(img) != -1)
                        {
                            continue;
                        }

                        // Links not starting with "http" are prefixed with the seed address.
                        string imgSrc = img.IndexOf("http") == 0 ? img : domian + img;

                        // Title, excerpt and domain meta lines are currently not emitted.
                        string snippet = "<div class='grid'>"
                                         + "<div class='imgholder'>"
                                         + "<a href='" + receivedArgs.Url + "' target='_blank'>"
                                         + "<img src='" + imgSrc + "'>"
                                         + "</a>"
                                         + "</div>"
                                         + "</div>";

                        ImagesHtmlCodeStackOperate(PUSH, snippet, img);
                    }
                }
            }

            // Record the URL whether or not the page was analysed.
            URLStackOperate(PUSH, receivedArgs.Url);
        };

        spider.Crawl();
    }
예제 #7
0
        /// <summary>
        /// Crawls a single site breadth-first, starting at the first space-separated token
        /// of <paramref name="url"/> (prefixed with "http://"), and saves every page whose
        /// extracted content is longer than 10 characters through <c>DataSaver</c>.
        /// </summary>
        /// <remarks>
        /// Only links whose host equals the start URL's host are followed. Failures are
        /// written to the console; the method does not propagate them.
        /// </remarks>
        /// <param name="url">Host (optionally followed by space-separated extra tokens, which are ignored).</param>
        /// <returns>Always <c>true</c> (boxed), including when nothing could be crawled.</returns>
        public object GetData(string url)
        {
            string[] urlInfo = (url ?? string.Empty).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            if (urlInfo.Length == 0)
            {
                // Previously an empty or null argument threw before reaching the error handling below.
                Console.WriteLine("GetData: no URL supplied.");
                return(true);
            }

            DataSaver   dataSaver = new DataSaver();
            List <Page> docs      = new List <Page>();

            // Links still waiting to be crawled (FIFO, so the crawl is breadth-first).
            Queue <string> pendingUrls = new Queue <string>();

            // Links already crawled or already queued.
            HashSet <string> visitedUrls = new HashSet <string>();

            string schemaUrl = "http://" + urlInfo[0];

            // Building the Uri may fail for a malformed host.
            try
            {
                Uri hostUrl = new Uri(schemaUrl);
                pendingUrls.Enqueue(schemaUrl);
                string site = string.Empty;

                while (pendingUrls.Count > 0)
                {
                    string currentUrl = pendingUrls.Dequeue();
                    try
                    {
                        // Fetch and parse the page.
                        Console.WriteLine(DateTime.Now.ToString() + "[" + Thread.CurrentThread.ManagedThreadId + "]" + ":Visit " + currentUrl);
                        visitedUrls.Add(currentUrl);
                        bool isGetContentSuc = false;
                        Html2Article.ArticleDocument document = Html2Article.GetArticle(currentUrl, ref isGetContentSuc);
                        if (document == null)
                        {
                            continue;
                        }

                        if (document.Content != null && document.Content.Length > 10)
                        {
                            // The site name is taken once, from the last '-' / '_' separated part of a title.
                            if (string.IsNullOrEmpty(site) && !string.IsNullOrEmpty(document.Title))
                            {
                                string[] titleParts = document.Title.Split(new char[] { '-', '_' }, StringSplitOptions.RemoveEmptyEntries);
                                if (titleParts.Length > 0)
                                {
                                    // A title made only of separators yields no parts; keep the site empty then.
                                    site = titleParts[titleParts.Length - 1];
                                }
                            }

                            Page page = new Page();
                            page.Url       = currentUrl;
                            page.Site      = site;
                            page.Content   = document.Content;
                            page.Title     = document.Title;
                            page.Timestamp = DateTime.Now.ToString();// or UTC
                            docs.Add(page);
                            dataSaver.SavePage(ref docs, GetRootFolder() + "\\RawData");
                        }

                        // Queue the page's same-host links that have not been seen yet.
                        for (int j = 0; j < document.ChildrenLink.Count; j++)
                        {
                            string rawLink = document.ChildrenLink[j];
                            Uri    linkUri;

                            // Relative or malformed links are skipped instead of raising an exception.
                            if (string.IsNullOrEmpty(rawLink) || !Uri.TryCreate(rawLink, UriKind.Absolute, out linkUri))
                            {
                                continue;
                            }
                            if (linkUri.Host != hostUrl.Host)
                            {
                                continue;
                            }

                            string link = StripFragmentAndTrailingSlash(rawLink);

                            // HashSet.Add returns false for duplicates, so one call both tests and records.
                            if (visitedUrls.Add(link))
                            {
                                pendingUrls.Enqueue(link);
                            }
                        }
                    }
                    catch (Exception exception)
                    {
                        Console.WriteLine(exception);
                    }
                }

                if (docs.Count > 0)
                {
                    dataSaver.SavePage(ref docs, GetRootFolder() + "\\RawData");
                    docs.Clear();
                }
            }
            catch (Exception exception)
            {
                Console.WriteLine(exception);
            }

            return(true);
        }

        /// <summary>
        /// Removes the "#fragment" part and one trailing "/" from a link so that
        /// equivalent addresses compare equal.
        /// </summary>
        private static string StripFragmentAndTrailingSlash(string link)
        {
            // Cut exactly at '#'. The former "IndexOf - 1" also removed the character
            // before it and threw when '#' was the first character.
            int hashIndex = link.IndexOf('#');
            if (hashIndex >= 0)
            {
                link = link.Substring(0, hashIndex);
            }
            if (link.Length > 0 && link[link.Length - 1] == '/')
            {
                link = link.Substring(0, link.Length - 1);
            }
            return link;
        }