/// <summary>
/// Handles completion of a browser navigation: reads the loaded markup, extracts the
/// article from it and shows the timing, publish date, title, plain text and tagged
/// HTML in the form's controls.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Completion data; <c>e.Url</c> is compared with the current document URL.</param>
private void webBrowser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    // Only react when the completed URL is the URL of the browser's current document.
    if (e.Url != this.webBrowser.Document.Url)
    {
        return;
    }

    string encode = this.webBrowser.Document.Encoding;

    // Read the whole document using the encoding the browser reports, and dispose the
    // reader (and the stream it wraps) as soon as the text has been read.
    string html;
    using (StreamReader sr = new StreamReader(this.webBrowser.DocumentStream, Encoding.GetEncoding(encode)))
    {
        html = sr.ReadToEnd();
    }

    // Optional tuning, currently disabled:
    //   Html2Article.LimitCount = 100;
    //   Html2Article.Depth = 8;

    // Choose whether the extractor runs in content-append mode.
    Html2Article.AppendMode = this.appendCheckBox.CheckState == CheckState.Checked;

    // Parse the HTML into structured Article data and time the extraction.
    Stopwatch sw = Stopwatch.StartNew();
    Article article = Html2Article.GetArticle(html);
    sw.Stop();

    msgLabel.Text = "提取耗时:" + Environment.NewLine + sw.ElapsedMilliseconds + "毫秒";
    this.publishDateTextBox.Text = article.PublishDate.ToString();
    this.titleTextBox.Text = article.Title;
    this.contentTextBox.Text = article.Content;

    // Fix up URLs in the tagged content against the address typed by the user
    // before rendering it in the preview browser.
    string articleHtml = UrlUtility.FixUrl(this.urlTextBox.Text, article.ContentWithTags);
    this.contentWebBrowser.DocumentText = articleHtml;

    ResetState();
}
/// <summary>
/// Fetches the chapter page at <c>nVChapter.Link</c>, extracts its article and returns
/// the chapter text as: an empty line, the chapter title line, then the content line.
/// </summary>
/// <param name="nVChapter">Chapter providing the link to download and the title to emit.</param>
/// <returns>The formatted chapter text.</returns>
public string GetChapterContent(NVChapter nVChapter)
{
    var pageHtml = HttpHelper.GetString(nVChapter.Link, baseEncoding);
    Article parsed = Html2Article.GetArticle(pageHtml);

    // Leading blank line, then title and body, each terminated by a newline.
    return new StringBuilder()
        .AppendLine()
        .Append(nVChapter.Title).AppendLine()
        .Append(parsed.Content).AppendLine()
        .ToString();
}
/// <summary>
/// Downloads <paramref name="url"/> with a GET request, extracts the article from the
/// returned HTML and serializes its content and title as a JSON object.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>JSON text with the properties <c>content</c> and <c>title</c>.</returns>
public string getUrl(string url)
{
    HttpHelper http = new HttpHelper();
    var result = http.GetHtml(new HttpItem() { URL = url, Method = "GET" });

    // The page is handed straight to the article extractor; no separate DOM parse is
    // needed because nothing else in this method reads the document.
    var article = Html2Article.GetArticle(result.Html);

    return JsonConvert.SerializeObject(new { content = article.Content, title = article.Title });
}
/// <summary>
/// Downloads <c>RequestUri</c>, extracts the article from the page, adds it to the index
/// and, while <paramref name="depth"/> is below <c>wm.Depth</c>, queues the page's links
/// in <c>wm.unfinisheduri</c> with depth + 1.
/// </summary>
/// <param name="wm">Shared work state: page counter, depth limit and pending-URI table.</param>
/// <param name="depth">Depth of the page being fetched.</param>
/// <param name="indexmanager">Index that receives the page title, content and URI.</param>
/// <remarks>
/// The method is <c>async void</c>, so it must never let an exception escape; failures
/// are written to the debug output instead of being rethrown.
/// </remarks>
public async void GenerateWebRequestAsync(WorkManage wm, int depth, IndexManager indexmanager)
{
    try
    {
        webRequest = (HttpWebRequest)WebRequest.Create(RequestUri); // create the request
        webRequest.Method = "GET";
        webRequest.KeepAlive = true;                                // persistent connection
        // NOTE(review): 100 ms; the HttpWebRequest documentation states that Timeout does
        // not apply to asynchronous requests - confirm whether a real timeout is needed.
        webRequest.Timeout = 100;

        webResponse = (HttpWebResponse)await webRequest.GetResponseAsync();
        ContentStream = webResponse.GetResponseStream();            // response byte stream
        string html = GetContent();                                 // convert to HTML text

        GetLinks getLinks = new GetLinks(html);

        Html2Article.AppendMode = false;
        Html2Article.Depth = 80;
        Article article = Html2Article.GetArticle(html);

        // NOTE(review): this increment is not synchronized here - confirm whether
        // several requests can update wm.sum concurrently.
        wm.sum++;
        indexmanager.AddIndex(article.Title, article.Content, RequestUri);

        if (depth < wm.Depth)
        {
            foreach (string uri in getLinks.GetUris())
            {
                // Test and insert under the same lock so that two callers cannot both
                // pass the check and then collide on Add.
                lock (wm.unfinisheduri)
                {
                    if (!wm.unfinisheduri.ContainsKey(uri))
                    {
                        wm.unfinisheduri.Add(uri, depth + 1);
                    }
                }
            }
        }
    }
    catch (System.Exception ex)
    {
        // Best-effort crawl: a failed page is skipped, but the reason is recorded.
        System.Diagnostics.Debug.WriteLine(ex);
    }
    finally
    {
        // Release the connection on failure as well as on success.
        if (webRequest != null)
        {
            webRequest.Abort();
        }

        if (webResponse != null)
        {
            webResponse.Close();
        }
    }
}
/// <summary>
/// Entry point: configures log4net, shows the settings dialog and, when the dialog is
/// confirmed, starts a crawl that prints every added URL and shows each received page's
/// extracted article in a message box.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Initialise log4net from the "Config/log4net.config" file under the startup path.
    string logConfigPath = Path.Combine(Application.StartupPath, "Config", "log4net.config");
    log4net.Config.XmlConfigurator.Configure(new FileInfo(logConfigPath));

    FrmSettings settingsForm = new FrmSettings();
    if (settingsForm.ShowDialog() != DialogResult.OK)
    {
        return;
    }

    var crawlSettings = settingsForm.Settings;
    var log = Log4netFactory.CreateLogger();

    // No list of previously unhandled links is supplied (third argument is null).
    Spider spider = new Spider(crawlSettings, log, null);

    spider.AddUrlEvent += addUrlArgs =>
    {
        // Database de-duplication and persistence of the URL are disabled here;
        // every URL is echoed to the console and accepted.
        string line = addUrlArgs.Title + " - " + addUrlArgs.Url;
        Console.WriteLine(line);
        return(true);
    };

    spider.DataReceivedEvent += receivedArgs =>
    {
        // Saving the parsed page to the database is disabled here; the extraction
        // result is only displayed.
        MessageBox.Show(Html2Article.GetArticle(receivedArgs.Html));
    };

    spider.Crawl();
}
/// <summary>
/// Stops any running spider and starts a new crawl of <paramref name="domian"/>. For
/// every newly seen page whose extracted text contains <paramref name="keyword"/>, each
/// image not seen before is pushed as an HTML grid snippet linking back to the page.
/// </summary>
/// <param name="domian">Seed address of the crawl; also prefixed to image URLs that do not start with "http".</param>
/// <param name="keyword">Text that the article content must contain for its images to be collected.</param>
private void StartSpider(string domian, string keyword)
{
    if (spider != null)
    {
        spider.Stop();
    }

    var settings = new Settings();
    settings.InitSeeds = domian;
    settings.LockHost = true;
    settings.KeepCookie = true;
    settings.Threads = 5;        // number of crawl threads
    settings.UserAgent = "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36";
    settings.Timeout = 1000;
    settings.CrawlDepth = -1;    // crawl depth; -1 means traverse the whole site
    settings.LimitSpeed = false; // no smart bandwidth control: download at full speed

    Html2Article.AppendMode = true;
    Html2Article.LimitCount = 50;
    Html2Article.Depth = 10;

    spider = new Spider(settings, null, null);

    // Accept every discovered URL.
    spider.AddUrlEvent += addUrlArgs => { return(true); };

    spider.DataReceivedEvent += receivedArgs =>
    {
        // Only pages whose URL has not been recorded yet are analysed.
        if (URLStackOperate(POPALL, null).IndexOf(receivedArgs.Url) == -1)
        {
            Article article = Html2Article.GetArticle(receivedArgs.Html);

            // The article must mention the keyword and contain image markup. The markup
            // test is ordinal: "img" is a token, not natural-language text.
            if (article.Content.IndexOf(keyword) != -1
                && article.ContentWithTags.IndexOf("img", System.StringComparison.Ordinal) != -1)
            {
                foreach (Match match in ImgLinkRegex.Matches(article.ContentWithTags))
                {
                    string img = match.Groups["imgUrl"].Value;

                    // Skip images that have already been collected.
                    if (ImagesHtmlCodeStackOperate(POPALL, null, null).IndexOf(img) != -1)
                    {
                        continue;
                    }

                    // Image URLs that do not start with "http" are prefixed with the seed address.
                    string imgSrc = img.StartsWith("http", System.StringComparison.Ordinal)
                        ? img
                        : domian + img;

                    // Only the linked image is emitted; no title/summary/meta elements.
                    // NOTE(review): the page URL and image URL are interpolated into
                    // single-quoted attributes without escaping - confirm the consumer
                    // of this markup tolerates URLs containing quotes.
                    string htmlCode = "<div class='grid'>"
                                      + "<div class='imgholder'>"
                                      + "<a href='" + receivedArgs.Url + "' target='_blank'>"
                                      + "<img src='" + imgSrc + "'>"
                                      + "</a>"
                                      + "</div>"
                                      + "</div>";

                    ImagesHtmlCodeStackOperate(PUSH, htmlCode, img);
                }
            }
        }

        // Record the URL whether or not anything was collected from it.
        URLStackOperate(PUSH, receivedArgs.Url);
    };

    spider.Crawl();
}
/// <summary>
/// Crawls a site breadth-first starting at "http://" + the first space-separated token
/// of <paramref name="url"/>. Every page whose extracted content is longer than 10
/// characters is saved through <c>DataSaver.SavePage</c>; links on the same host are
/// queued for later visits.
/// </summary>
/// <param name="url">Host (optionally followed by space-separated extra tokens, which are ignored).</param>
/// <returns>Always <c>true</c> (boxed); failures are written to the console, not thrown.</returns>
public object GetData(string url)
{
    DataSaver dataSaver = new DataSaver();
    List<Page> docs = new List<Page>();
    // Links waiting to be crawled, in discovery order.
    Queue<string> pendingUrls = new Queue<string>();
    // Links already crawled or already queued.
    HashSet<string> visitedUrls = new HashSet<string>();

    string[] urlInfo = (url ?? string.Empty).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    if (urlInfo.Length == 0)
    {
        // Nothing to crawl; report it the same way other failures are reported.
        Console.WriteLine("GetData: no url supplied.");
        return(true);
    }

    string schemaUrl = "http://" + urlInfo[0];

    // Building the Uri may fail for malformed input.
    try
    {
        Uri hostUrl = new Uri(schemaUrl);
        pendingUrls.Enqueue(schemaUrl);
        string site = string.Empty;

        // Keep crawling while there are queued links.
        while (pendingUrls.Count > 0)
        {
            string currentUrl = pendingUrls.Dequeue();
            HashSet<string> newLinks = new HashSet<string>();

            try
            {
                // 2. Fetch the page.
                Console.WriteLine(DateTime.Now.ToString() + "[" + Thread.CurrentThread.ManagedThreadId + "]" + ":Visit " + currentUrl);
                visitedUrls.Add(currentUrl);

                bool isGetContentSuc = false;
                Html2Article.ArticleDocument document = Html2Article.GetArticle(currentUrl, ref isGetContentSuc);

                if (document != null && document.Content != null && document.Content.Length > 10)
                {
                    // The site name is the last '-' / '_' separated part of the first
                    // usable title; a title with no such part leaves it unset for now.
                    if (string.IsNullOrEmpty(site) && document.Title != null)
                    {
                        string[] titleArray = document.Title.Split(new char[] { '-', '_' }, StringSplitOptions.RemoveEmptyEntries);
                        if (titleArray.Length > 0)
                        {
                            site = titleArray[titleArray.Length - 1];
                        }
                    }

                    Page page = new Page();
                    page.Url = currentUrl;
                    page.Site = site;
                    page.Content = document.Content;
                    page.Title = document.Title;
                    page.Timestamp = DateTime.Now.ToString(); // or UTC
                    docs.Add(page);
                    dataSaver.SavePage(ref docs, GetRootFolder() + "\\RawData");
                }

                // 3. Collect new links.
                if (document != null && document.ChildrenLink != null)
                {
                    for (int j = 0; j < document.ChildrenLink.Count; j++)
                    {
                        try
                        {
                            string rawLink = document.ChildrenLink[j];
                            string link = rawLink;

                            // Drop the fragment: keep everything before '#'.
                            int fragmentIndex = link.IndexOf('#');
                            if (fragmentIndex >= 0)
                            {
                                link = link.Substring(0, fragmentIndex);
                            }

                            // Drop one trailing slash so "a/" and "a" count as the same page.
                            if (link.EndsWith("/", StringComparison.Ordinal))
                            {
                                link = link.Substring(0, link.Length - 1);
                            }

                            string host = (new Uri(rawLink)).Host;

                            // Stay on the start host; visitedUrls.Add is false for links
                            // that were already crawled or queued.
                            if (host == hostUrl.Host && visitedUrls.Add(link))
                            {
                                newLinks.Add(link);
                            }
                        }
                        catch (Exception exception)
                        {
                            Console.WriteLine(exception);
                        }
                    }
                }
            }
            catch (Exception exception)
            {
                Console.WriteLine(exception);
            }

            foreach (string newLink in newLinks)
            {
                pendingUrls.Enqueue(newLink);
            }
        }

        // Flush whatever is still held in the list.
        if (docs.Count > 0)
        {
            dataSaver.SavePage(ref docs, GetRootFolder() + "\\RawData");
            docs.Clear();
        }
    }
    catch (Exception exception)
    {
        Console.WriteLine(exception);
    }

    return(true);
}