/// <summary>
/// Sets the proxy IP, fetches the page at <paramref name="html"/> and appends the result to
/// <c>OriginHtml</c>. Any exception is shown in a message box and not propagated.
/// </summary>
/// <param name="ip">Proxy IP handed to <c>IPChange.IPChange.SetProxyIP</c>.</param>
/// <param name="html">Address of the page to fetch.</param>
private void GetHtmlContent(string ip, Uri html)
{
    try
    {
        IPChange.IPChange.SetProxyIP(ip);

        // The fetch result is stored as-is, without inspecting it.
        var fetched = HtmlCrawler.GetHtmlContent(html);
        OriginHtml.Add(fetched);
    }
    catch (Exception error)
    {
        MessageBox.Show(error.ToString());
    }
}
/// <summary>
/// Switches the proxy IP and fetches a web page. When the fetch yields a string it is
/// appended to <c>OriginHtml</c>.
/// </summary>
/// <param name="ip">Proxy IP handed to <c>IPChange.IPChange.SetProxyIP</c>.</param>
/// <param name="html">Address of the page to fetch; it is converted to a <see cref="Uri"/>.</param>
/// <returns>
/// <c>true</c> when content was stored; <c>false</c> when the fetch did not yield a string or
/// an exception was caught (the exception text is shown in a message box).
/// </returns>
private bool GetHtmlContent(string ip, string html)
{
    bool stored = false;
    try
    {
        IPChange.IPChange.SetProxyIP(ip);

        var pageUri = new Uri(html);
        if (HtmlCrawler.GetHtmlContent(pageUri) is string content)
        {
            OriginHtml.Add(content);
            stored = true;
        }
    }
    catch (Exception error)
    {
        MessageBox.Show(error.ToString());
        stored = false;
    }

    return stored;
}
/// <summary>
/// Resets the download state, fetches the proxy IP list and starts a
/// <see cref="BackgroundWorker"/> that runs <c>Work_DownloadHomePage</c>, passing the number of
/// entries in <c>NovelList</c> as the worker argument.
/// </summary>
private void HomePageDatadownload()
{
    // Reset the per-run state.
    StrartTime = DateTime.Now;
    OriginHtml = new List<string>();
    IsWorkNotRuning = false;

    Worker = new BackgroundWorker
    {
        WorkerReportsProgress = true,      // allow progress reporting
        WorkerSupportsCancellation = true  // allow the task to be cancelled
    };

    // Gather the inputs for the run.
    links = NovelList.Count;
    IPList = HtmlAnalysis.GetIPList(HtmlCrawler.GetHtmlContent(IPNet));

    // Attach every handler BEFORE starting the worker: once RunWorkerAsync has been called the
    // worker may report progress or complete at any time, and an event raised before its
    // handler is subscribed is lost.
    Worker.DoWork += Work_DownloadHomePage;
    Worker.ProgressChanged += Worker_DownloadHomePageProgressChanged;
    Worker.RunWorkerCompleted += Worker_RunDownloadHomePageCompleted;

    Worker.RunWorkerAsync(links);
}
/// <summary>
/// Converts a bound value into whatever <c>HtmlCrawler.GetHtmlImage</c> returns for it.
/// </summary>
/// <param name="value">Bound value; treated as a <see cref="Uri"/> (null is passed on if it is not one).</param>
/// <param name="targetType">Not used.</param>
/// <param name="parameter">Not used.</param>
/// <param name="culture">Not used.</param>
/// <returns>The result of <c>HtmlCrawler.GetHtmlImage</c>.</returns>
public override object Convert(object value, Type targetType, object parameter, CultureInfo culture)
{
    Uri imageUri = value as Uri;
    var image = HtmlCrawler.GetHtmlImage(imageUri);
    return image;
}
/// <summary>
/// Creates the reading-window view model for one novel: loads its section list, shows the
/// section the reader last stopped at (or the first one), and wires up the selection,
/// download, close and closing commands.
/// </summary>
/// <param name="data">The novel to read; it is cast to <c>Novel</c>.</param>
public ReadWndViewModel(object data)
{
    Book = data as Novel;
    Reader = (Application.Current.MainWindow.DataContext as StartWinodwViewModel).Reader;
    GetSectionLinks(Book);

    // Load and display the section picked in the list.
    SelectItemChangedCommand = new DelegateCommand<object>((p) =>
    {
        // SelectedIndex is -1 while nothing is selected; indexing Sections with it would throw.
        if (p is ListView listView && listView.SelectedIndex >= 0)
        {
            var htmlContent = HtmlCrawler.GetHtmlContent(Book.Sections[listView.SelectedIndex].Html);
            CurrentContent = HtmlAnalysis.AnalysisSectionContent(htmlContent);
        }
    });

    // Ask for a target file, then start the download.
    DownloadSectionsCommand = new DelegateCommand(() =>
    {
        SaveFileDialog dialog = new SaveFileDialog
        {
            Filter = "txt files(*.txt)|*.txt|word files(*.doc)|*.doc|All files(*.*)|*.*",
            FileName = Book.Name,
            DefaultExt = "txt"
        };
        if (dialog.ShowDialog() == true)
        {
            path = dialog.FileName;
            Datadownload();
        }
    });

    // Saved reading position (1-based section number); 0 when there is none.
    int index = 0;
    if (Reader.Level >= 0)
    {
        var savedEntry = Reader.Books.Find(b => b.BookID == Book.Id);
        if (savedEntry != null)
        {
            index = savedEntry.SectionIndex;
        }
    }

    CurrentContent = HtmlAnalysis.AnalysisSectionContent(
        HtmlCrawler.GetHtmlContent(Book.Sections[(index - 1) >= 0 ? (index - 1) : 0].Html));
    SysFontFamilies = Fonts.SystemFontFamilies;

    WinCloseCommand = new DelegateCommand<object>((p) =>
    {
        // Tolerate a command parameter that is not a Window instead of dereferencing null.
        (p as Window)?.Close();
        Application.Current.MainWindow.Show();
    });

    WinClosingCommand = new DelegateCommand<object>((p) =>
    {
        // Guest mode (negative level): nothing is saved.
        if (reader.Level < 0)
        {
            return;
        }

        // Progress to store: the selected section (1-based), or the position loaded at start-up
        // when nothing is selected. Computed locally so the closing window's selection is not
        // changed (which would re-trigger the selection command).
        int selected = (p as ListView)?.SelectedIndex ?? -1;
        int sectionIndex = selected >= 0 ? selected + 1 : index;

        // Decide "already on the shelf" by looking the book up, not by index > 0: a shelved
        // book whose saved position is 0 would otherwise be added a second time.
        var shelfEntry = Reader.Books.Find(b => b.BookID == Book.Id);
        if (shelfEntry != null)
        {
            // Already on the shelf: update the reading progress.
            shelfEntry.SectionIndex = sectionIndex;
            ElasticSearch.ElasticHelper.Insert(Reader);
        }
        else if (MessageBox.Show("是否加入书架?", string.Empty, MessageBoxButton.OKCancel) == MessageBoxResult.OK)
        {
            // Not on the shelf yet: add it. OK/Cancel buttons are requested explicitly because
            // the single-argument overload only offers OK, leaving no way to decline.
            Reader.Books.Add(new Model.Book() { BookID = Book.Id, SectionIndex = sectionIndex });
            ElasticSearch.ElasticHelper.Insert(Reader);
        }
    });
}
/// <summary>
/// Fetches the novel's directory page and stores the parsed result in <c>novel.Sections</c>.
/// </summary>
/// <param name="novel">Novel whose <c>DirectoryUri</c> is fetched and whose <c>Sections</c> is assigned.</param>
private void GetSectionLinks(Novel novel)
{
    var directoryUri = new System.Uri(novel.DirectoryUri);
    var directoryHtml = HtmlCrawler.GetHtmlContent(directoryUri);
    novel.Sections = HtmlAnalysis.AnalysisDirectory(directoryHtml);
}
/// <summary>
/// Main loop of the worker. Every 50 ms it records a performance row, processes at most one
/// message from the command queue ("stop" or "start:&lt;robots url&gt;"), drains the XML queue
/// and processes at most one message from the URL queue.
/// </summary>
public override void Run()
{
    RobotParser parser = new RobotParser();
    HtmlCrawler htmlCrawlerCNN = new HtmlCrawler(new HashSet<string>());
    HtmlCrawler htmlCrawlerNBA = new HtmlCrawler(new HashSet<string>());

    // Current worker state: "Idle", "Loading" or "Crawling". Exactly one state is active at a
    // time, so a single value replaces three mutually exclusive flags.
    string status = "Idle";

    Trace.TraceInformation("WorkerRole1 is running");

    while (true)
    {
        Thread.Sleep(50);

        // Re-record the current counters under the current status.
        var crawled = 0;
        var sizeQueue = 0;
        var sizeIndex = 0;
        TryReadPerformanceSnapshot(ref crawled, ref sizeQueue, ref sizeIndex);
        Performance.insertPerformance(status, crawled, sizeQueue, sizeIndex);

        // ---- Command queue ----
        CloudQueueMessage commandMessage = StorageManager.getCommandQueue().GetMessage(TimeSpan.FromMinutes(5));
        if (commandMessage != null)
        {
            StorageManager.getCommandQueue().DeleteMessage(commandMessage);
            string command = commandMessage.AsString;

            if (command == "stop")
            {
                // Clear all queues and tables.
                StorageManager.deleteAllQueues();
                StorageManager.deleteTables();

                // Reset the parser and both crawlers.
                parser = new RobotParser("");
                htmlCrawlerCNN.crawlable = false;
                htmlCrawlerCNN.Visited = new HashSet<string>();
                htmlCrawlerCNN.Disallow = new HashSet<string>();
                htmlCrawlerNBA.crawlable = false;
                htmlCrawlerNBA.Visited = new HashSet<string>();
                htmlCrawlerNBA.Disallow = new HashSet<string>();

                status = "Idle";

                // Record the idle state with all counters cleared.
                Performance.insertPerformance("Idle", 0, 0, 0);
            }
            else if (command.StartsWith("start:", StringComparison.Ordinal))
            {
                status = "Loading";

                // Record the loading state with unchanged counters.
                TryReadPerformanceSnapshot(ref crawled, ref sizeQueue, ref sizeIndex);
                Performance.insertPerformance("Loading", crawled, sizeQueue, sizeIndex);

                ServicePointManager.Expect100Continue = true;
                ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12;

                // Everything after the "start:" prefix is the robots.txt address.
                var robotFile = command.Substring(6);
                string contents;
                using (var wc = new System.Net.WebClient())
                {
                    contents = wc.DownloadString(robotFile);
                }

                // Parse robots.txt and enqueue its sitemap files.
                parser = new RobotParser(contents);
                foreach (string filepath in parser.XMLFiles)
                {
                    // Only XML files from cnn and nba are queued.
                    if (filepath.Contains("cnn") || filepath.Contains("nba"))
                    {
                        CloudQueueMessage filepathMessage = new CloudQueueMessage(filepath);
                        StorageManager.getXMLQueue().AddMessage(filepathMessage);
                    }
                }

                // Give the matching crawler the disallow rules from this robots.txt.
                if (robotFile.Contains("cnn"))
                {
                    htmlCrawlerCNN = new HtmlCrawler(parser.Disallow);
                }
                if (robotFile.Contains("bleacherreport"))
                {
                    htmlCrawlerNBA = new HtmlCrawler(parser.Disallow);
                }

                // NOTE(review): an "Idle" row is written here while the tracked status stays
                // "Loading" (the next iteration records "Loading" again) - kept as in the
                // original; confirm this is intended.
                Performance.insertPerformance("Idle", crawled, sizeQueue, sizeIndex);
            }
        }

        // ---- XML queue: drained completely on every iteration ----
        CloudQueueMessage XML = StorageManager.getXMLQueue().GetMessage(TimeSpan.FromMinutes(5));
        while (XML != null)
        {
            if (XML.AsString.Contains("cnn.com"))
            {
                htmlCrawlerCNN.readXMLUrl(XML.AsString);
            }
            if (XML.AsString.Contains("bleacherreport.com"))
            {
                htmlCrawlerNBA.readXMLUrl(XML.AsString);
            }
            StorageManager.getXMLQueue().DeleteMessage(XML);
            XML = StorageManager.getXMLQueue().GetMessage(TimeSpan.FromMinutes(5));
        }

        // ---- URL queue: at most one message per iteration ----
        CloudQueueMessage HTML = StorageManager.getUrlQueue().GetMessage(TimeSpan.FromMinutes(5));
        if (HTML != null)
        {
            if (htmlCrawlerCNN.crawlable || htmlCrawlerNBA.crawlable)
            {
                status = "Crawling";

                // Record the crawling state with the queue size reduced by one
                // (only when a stored row was actually read).
                if (TryReadPerformanceSnapshot(ref crawled, ref sizeQueue, ref sizeIndex))
                {
                    sizeQueue -= 1;
                }
                Performance.insertPerformance("Crawling", crawled, sizeQueue, sizeIndex);
            }

            // A message that matches neither branch is not deleted here.
            if (htmlCrawlerCNN.crawlable && HTML.AsString.Contains("cnn.com"))
            {
                // CNN article.
                htmlCrawlerCNN.parseHTML(HTML.AsString);
                StorageManager.getUrlQueue().DeleteMessage(HTML);
            }
            else if (htmlCrawlerNBA.crawlable && HTML.AsString.Contains("bleacherreport.com"))
            {
                // Bleacher Report article.
                htmlCrawlerNBA.parseHTML(HTML.AsString);
                StorageManager.getUrlQueue().DeleteMessage(HTML);
            }
        }
    }
}

/// <summary>
/// Copies the counters of the row returned by a Take(1) query on the performance table into
/// the given variables. The variables are left untouched when the query returns no row.
/// </summary>
/// <param name="crawled">Receives <c>NumCrawled</c>.</param>
/// <param name="sizeQueue">Receives <c>SizeQueue</c>.</param>
/// <param name="sizeIndex">Receives <c>SizeIndex</c>.</param>
/// <returns><c>true</c> when a row was read; otherwise <c>false</c>.</returns>
private bool TryReadPerformanceSnapshot(ref int crawled, ref int sizeQueue, ref int sizeIndex)
{
    bool found = false;
    TableQuery<Performance> query = new TableQuery<Performance>()
        .Take(1);
    foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(query))
    {
        crawled = item.NumCrawled;
        sizeQueue = item.SizeQueue;
        sizeIndex = item.SizeIndex;
        found = true;
    }
    return found;
}