Example #1
 private void GetHtmlContent(string ip, Uri html)
 {
     try
     {
         IPChange.IPChange.SetProxyIP(ip);
         OriginHtml.Add(HtmlCrawler.GetHtmlContent(html));
     }
     catch (Exception ex)
     {
         MessageBox.Show(ex.ToString());
     }
 }
Example #2
 /// <summary>
 /// Switch to the given proxy IP and fetch the web page
 /// </summary>
 /// <param name="ip">Proxy IP to route the request through</param>
 /// <param name="html">URL of the page to download</param>
 /// <returns>true if the page content was retrieved and stored; otherwise false</returns>
 private bool GetHtmlContent(string ip, string html)
 {
     try
     {
         IPChange.IPChange.SetProxyIP(ip);
         if (HtmlCrawler.GetHtmlContent(new Uri(html)) is string content)
         {
             OriginHtml.Add(content);
             return true;
         }
         else
         {
             return false;
         }
     }
     catch (Exception ex)
     {
         MessageBox.Show(ex.ToString());
         return false;
     }
 }
Example #3
        private void HomePageDatadownload()
        {
            //Initialize the state for this run
            StrartTime      = DateTime.Now;
            OriginHtml      = new List<string>();
            IsWorkNotRuning = false;
            Worker          = new BackgroundWorker
            {
                WorkerReportsProgress      = true, //enable progress reporting
                WorkerSupportsCancellation = true  //enable cancellation
            };

            //Gather the inputs
            links  = NovelList.Count;
            IPList = HtmlAnalysis.GetIPList(HtmlCrawler.GetHtmlContent(IPNet));

            //Wire up all handlers before starting the background work
            Worker.DoWork             += Work_DownloadHomePage;
            Worker.ProgressChanged    += Worker_DownloadHomePageProgressChanged;
            Worker.RunWorkerCompleted += Worker_RunDownloadHomePageCompleted;
            Worker.RunWorkerAsync(links);
        }
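The Work_DownloadHomePage handler wired up above is not shown in these examples. A minimal sketch of what it might look like, assuming it walks NovelList, retries each page across the proxies in IPList via the GetHtmlContent(string ip, string html) overload from Example #2, and reports progress (the NovelList[i].HomePage property and the retry policy are assumptions, not taken from the original project):

        //Hypothetical sketch only: the real Work_DownloadHomePage is not part of these examples.
        private void Work_DownloadHomePage(object sender, DoWorkEventArgs e)
        {
            int total = (int)e.Argument;                //the link count passed to RunWorkerAsync
            for (int i = 0; i < total; i++)
            {
                if (Worker.CancellationPending)         //honor WorkerSupportsCancellation
                {
                    e.Cancel = true;
                    return;
                }

                //try each proxy in turn until one download succeeds (assumed retry policy)
                foreach (var ip in IPList)
                {
                    if (GetHtmlContent(ip, NovelList[i].HomePage))  //HomePage is an assumed property name
                    {
                        break;
                    }
                }

                Worker.ReportProgress((i + 1) * 100 / total);       //feeds Worker_DownloadHomePageProgressChanged
            }
        }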
Example #4
 public override object Convert(object value, Type targetType, object parameter, CultureInfo culture)
 {
     return HtmlCrawler.GetHtmlImage(value as Uri);
 }
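The class that contains this Convert override is not named in the example. A minimal sketch of calling it directly from code, assuming the containing class is a hypothetical UriToImageConverter : IValueConverter, would be:

 //Hypothetical direct call outside of a XAML binding; UriToImageConverter is an assumed class name.
 var converter = new UriToImageConverter();
 var cover = converter.Convert(new Uri("https://example.com/cover.jpg"),
                               typeof(object), null, CultureInfo.CurrentCulture);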
Example #5
        public ReadWndViewModel(object data)
        {
            Book   = data as Novel;
            Reader = (Application.Current.MainWindow.DataContext as StartWinodwViewModel).Reader;

            GetSectionLinks(Book);

            SelectItemChangedCommand = new DelegateCommand <object>((p) =>
            {
                if (p is ListView listView)
                {
                    var htmlContent = HtmlCrawler.GetHtmlContent(Book.Sections[listView.SelectedIndex].Html);
                    CurrentContent  = HtmlAnalysis.AnalysisSectionContent(htmlContent);
                }
            });

            DownloadSectionsCommand = new DelegateCommand(() =>
            {
                SaveFileDialog dialog = new SaveFileDialog
                {
                    Filter     = "txt files(*.txt)|*.txt|word files(*.doc)|*.doc|All files(*.*)|*.*",
                    FileName   = Book.Name,
                    DefaultExt = "txt"
                };
                if (dialog.ShowDialog() == true)
                {
                    path = dialog.FileName;
                    Datadownload();
                }
            });

            int index = 0;

            if (Reader.Level >= 0)
            {
                foreach (var item in Reader.Books)
                {
                    if (Book.Id == item.BookID)
                    {
                        index = item.SectionIndex;
                        break;
                    }
                }
            }

            CurrentContent = HtmlAnalysis.AnalysisSectionContent(HtmlCrawler.GetHtmlContent(Book.Sections[(index - 1) >= 0 ? (index - 1) : 0].Html));

            SysFontFamilies = Fonts.SystemFontFamilies;

            WinCloseCommand = new DelegateCommand <object>((p) =>
            {
                (p as Window).Close();
                Application.Current.MainWindow.Show();
            });

            WinClosingCommand = new DelegateCommand <object>((p) =>
            {
                //Skip persistence when running in guest mode
                if (Reader.Level >= 0)
                {
                    //The book is already on the shelf: update the reading progress
                    if (index > 0)
                    {
                        if ((p as ListView).SelectedIndex < 0)
                        {
                            (p as ListView).SelectedIndex = index - 1;
                        }
                        Reader.Books.Find(b => b.BookID == Book.Id).SectionIndex = (p as ListView).SelectedIndex + 1;
                        ElasticSearch.ElasticHelper.Insert(Reader);
                    }
                    //The book is not on the shelf yet: add it to the list
                    else
                    {
                        if (MessageBox.Show("是否加入书架?") == MessageBoxResult.OK)
                        {
                            if ((p as ListView).SelectedIndex < 0)
                            {
                                (p as ListView).SelectedIndex = index - 1;
                            }

                            Reader.Books.Add(new Model.Book()
                            {
                                BookID       = Book.Id,
                                SectionIndex = (p as ListView).SelectedIndex + 1
                            });
                            ElasticSearch.ElasticHelper.Insert(Reader);
                        }
                    }
                }
            });
        }
Example #6
 private void GetSectionLinks(Novel novel)
 {
     novel.Sections = HtmlAnalysis.AnalysisDirectory(HtmlCrawler.GetHtmlContent(new System.Uri(novel.DirectoryUri)));
 }
Example #7
        public override void Run()
        {
            RobotParser parser = new RobotParser();

            HtmlCrawler htmlCrawlerCNN = new HtmlCrawler(new HashSet <string>());

            HtmlCrawler htmlCrawlerNBA = new HtmlCrawler(new HashSet <string>());

            bool loading = false;

            bool crawling = false;

            bool idle = true;

            Trace.TraceInformation("WorkerRole1 is running");

            while (true)
            {
                Thread.Sleep(50);
                string status = "";
                if (idle == true)
                {
                    status = "Idle";
                }
                else if (crawling == true)
                {
                    status = "Crawling";
                }
                else if (loading == true)
                {
                    status = "Loading";
                }

                //add performance with no changes in queue size, index size, or number crawled
                var crawled   = 0;
                var sizeQueue = 0;
                var sizeIndex = 0;
                TableQuery <Performance> query3 = new TableQuery <Performance>()
                                                  .Take(1);

                foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(query3))
                {
                    crawled   = item.NumCrawled;
                    sizeQueue = item.SizeQueue;
                    sizeIndex = item.SizeIndex;
                }

                Performance.insertPerformance(status, crawled, sizeQueue, sizeIndex);


                //Handle Command Queue
                CloudQueueMessage commandMessage = StorageManager.getCommandQueue().GetMessage(TimeSpan.FromMinutes(5));

                //In the case there is no more Urls to crawl, or at the beginning, this command message will be called
                if (commandMessage != null)
                {
                    StorageManager.getCommandQueue().DeleteMessage(commandMessage);

                    //command message is stop
                    if (commandMessage.AsString == "stop")
                    {
                        //clear queue and table
                        StorageManager.deleteAllQueues();
                        StorageManager.deleteTables();
                        //reset parser and crawler
                        parser = new RobotParser("");
                        htmlCrawlerCNN.crawlable = false;
                        htmlCrawlerCNN.Visited   = new HashSet <string>();
                        htmlCrawlerCNN.Disallow  = new HashSet <string>();

                        htmlCrawlerNBA.crawlable = false;
                        htmlCrawlerNBA.Visited   = new HashSet <string>();
                        htmlCrawlerNBA.Disallow  = new HashSet <string>();

                        loading  = false;
                        crawling = false;
                        idle     = true;

                        //add performance, clear queue sizes

                        Performance.insertPerformance("Idle", 0, 0, 0);
                    }

                    //command message is start
                    if (commandMessage.AsString.StartsWith("start:"))
                    {
                        crawling = false;
                        idle     = false;
                        loading  = true;

                        //add performance with no changes in queue size, index size, or number crawled
                        TableQuery <Performance> queryStart = new TableQuery <Performance>()
                                                              .Take(1);

                        foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(queryStart))
                        {
                            crawled   = item.NumCrawled;
                            sizeQueue = item.SizeQueue;
                            sizeIndex = item.SizeIndex;
                        }

                        Performance.insertPerformance("Loading", crawled, sizeQueue, sizeIndex);


                        ServicePointManager.Expect100Continue = true;
                        ServicePointManager.SecurityProtocol  = SecurityProtocolType.Tls12;

                        var robotFile = commandMessage.AsString.Substring(6);

                        string contents;
                        using (var wc = new System.Net.WebClient())
                        {
                            contents = wc.DownloadString(robotFile);
                        }

                        //create and parse through robots.txt
                        parser = new RobotParser(contents);

                        foreach (string filepath in parser.XMLFiles)
                        {
                            //only XMLs from cnn and nba
                            if (filepath.Contains("cnn") || filepath.Contains("nba"))
                            {
                                CloudQueueMessage filepathMessage = new CloudQueueMessage(filepath);
                                StorageManager.getXMLQueue().AddMessage(filepathMessage);
                            }
                        }

                        if (robotFile.Contains("cnn"))
                        {
                            htmlCrawlerCNN = new HtmlCrawler(parser.Disallow);
                        }

                        if (robotFile.Contains("bleacherreport"))
                        {
                            htmlCrawlerNBA = new HtmlCrawler(parser.Disallow);
                        }
                        //set the crawler with the disallows

                        Performance.insertPerformance("Idle", crawled, sizeQueue, sizeIndex);
                    }
                }


                //Handle XML Queue
                CloudQueueMessage XML = StorageManager.getXMLQueue().GetMessage(TimeSpan.FromMinutes(5));
                while (XML != null)
                {
                    if (XML.AsString.Contains("cnn.com"))
                    {
                        htmlCrawlerCNN.readXMLUrl(XML.AsString);
                    }
                    if (XML.AsString.Contains("bleacherreport.com"))
                    {
                        htmlCrawlerNBA.readXMLUrl(XML.AsString);
                    }

                    StorageManager.getXMLQueue().DeleteMessage(XML);
                    XML = StorageManager.getXMLQueue().GetMessage(TimeSpan.FromMinutes(5));
                }

                //Handle HTML Queue
                CloudQueueMessage HTML = StorageManager.getUrlQueue().GetMessage(TimeSpan.FromMinutes(5));
                if (HTML != null)
                {
                    //handle performance
                    if (htmlCrawlerCNN.crawlable || htmlCrawlerNBA.crawlable)
                    {
                        idle     = false;
                        loading  = false;
                        crawling = true;

                        //add performance, reduce queue size

                        TableQuery <Performance> queryCNN = new TableQuery <Performance>()
                                                            .Take(1);

                        foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(queryCNN))
                        {
                            crawled   = item.NumCrawled;
                            sizeQueue = item.SizeQueue - 1;
                            sizeIndex = item.SizeIndex;
                        }

                        Performance.insertPerformance("Crawling", crawled, sizeQueue, sizeIndex);
                    }
                    //handles if it is a cnn article
                    if (htmlCrawlerCNN.crawlable == true && HTML.AsString.Contains("cnn.com"))
                    {
                        htmlCrawlerCNN.parseHTML(HTML.AsString);
                        StorageManager.getUrlQueue().DeleteMessage(HTML);
                    }
                    //handles if it is a bleacher report article
                    else if (htmlCrawlerNBA.crawlable == true && HTML.AsString.Contains("bleacherreport.com"))
                    {
                        htmlCrawlerNBA.parseHTML(HTML.AsString);
                        StorageManager.getUrlQueue().DeleteMessage(HTML);
                    }
                }
            }
        }
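The read-latest-row-then-insert pattern around Performance appears three times in this loop. A small helper along these lines (hypothetical, not part of the original project) could remove the duplication; it reuses only calls that already appear above:

        //Hypothetical helper: snapshot the latest Performance row and re-insert it with a new status.
        private static void RecordStatus(string status, int queueDelta = 0)
        {
            int crawled = 0, sizeQueue = 0, sizeIndex = 0;
            TableQuery<Performance> latest = new TableQuery<Performance>().Take(1);
            foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(latest))
            {
                crawled   = item.NumCrawled;
                sizeQueue = item.SizeQueue + queueDelta;   //e.g. -1 when a URL is pulled off the queue
                sizeIndex = item.SizeIndex;
            }
            Performance.insertPerformance(status, crawled, sizeQueue, sizeIndex);
        }

With a helper like this, the three inline query blocks above would collapse to calls such as RecordStatus("Crawling", -1);.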