public static void Main(string[] args)
{
    WebCrawler crawler = new WebCrawler();
    crawler.urlList.Add("http://data.kaohsiung.gov.tw/Opendata/List.aspx");
    crawler.craw();
}
public void GetTextEmptyTest()
{
    WebCrawler webpage = new WebCrawler();
    string pagetext = webpage.GetText("");

    // Assert.AreEqual takes (expected, actual): an empty URL should yield empty text.
    Assert.AreEqual("", pagetext);
}
public void GetTextNotEmptyTest()
{
    WebCrawler webpage = new WebCrawler();
    string pagetext = webpage.GetText("google.com");
    Assert.IsFalse(string.IsNullOrEmpty(pagetext));
}
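// The two tests above pin down GetText's contract: an empty URL yields an empty
// string, while a reachable host ("google.com", schemeless) yields non-empty page
// text. Below is a minimal sketch of a method satisfying that contract; the
// HttpClient usage and the scheme-defaulting logic are illustrative assumptions,
// not the original implementation.
public string GetText(string url)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        return string.Empty; // per GetTextEmptyTest
    }

    // GetTextNotEmptyTest passes "google.com", so tolerate schemeless input.
    if (!url.StartsWith("http://") && !url.StartsWith("https://"))
    {
        url = "http://" + url;
    }

    using (var client = new System.Net.Http.HttpClient())
    {
        return client.GetStringAsync(url).GetAwaiter().GetResult();
    }
}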
public void TestWebCrawl()
{
    var req = new WebCrawlJob()
    {
        SeedUrl = "http://stormwater.wef.org/2016/12/",
        Depth = 1,
        SourceId = 103,
        CrawlUrlPattern = new List<string>()
        {
            //"http://stormwater.wef.org/\\d.*/\\d.*/"
        },
        IndexUrlPattern = new List<string>()
        {
            "http://stormwater.wef.org/\\d.*/\\d.*/.*",
        },
        TitlePattern = new List<string>() { "\\body\\h1" },
        SummaryPattern = "/meta[@property='og:description']",
        ContentPattern = new List<string>() { "/div[@id='content']" },
    };
    var crawler = new WebCrawler<WebCrawlBasePage, WebCrawlerSearchDoc>(req);
    //var results = crawler.Run();
}
private List<Product> ConvertSearchModelsToDomainModels(WebCrawler crawler)
{
    var result = new List<Product>();
    foreach (var product in crawler.Products)
    {
        result.Add(new Product
        {
            Name = product.Name,
            Description = product.Description,
            Category = new Category { CategoryName = product.Category },
            Producer = new Producer { ProducerName = product.Producer },
            Distributor = new Distributor { DistributorName = product.Seller },
            SourceUrl = product.SourceUrl,
            TimeStamp = product.TimeStamp,
            Value = product.Value,
            OnSale = product.OnSale,
            SaleDeadline = product.SaleDeadline,
            SaleDescription = product.SaleDescription,
            SaleValue = product.SaleValue
        });
    }
    return result;
}
private async void procurar_Click(object sender, EventArgs e)
{
    if (CheckTextBox())
    {
        string plat = "";
        if (pcRadio.Checked)
        {
            plat = "pc";
        }
        else if (psRadio.Checked)
        {
            plat = "psn";
        }
        else if (xboxRadio.Checked)
        {
            plat = "xbox";
        }

        await WebCrawler.StartCrawlerAsync(plat, idTextBox.Text.Trim());
        if (!IdController.Existe)
        {
            string message = "Id de jogador não encontrado"; // "Player id not found"
            string caption = "Erro"; // "Error"
            alerts.Alert(message, caption);
        }
        else
        {
            IdController.ContaName = idTextBox.Text.Trim();
            TrocarForm();
        }
    }
}
static void Memento(string path)
{
    CareTaker careTaker = CareTaker.RestoreFromFile();
    Originator originator = null;
    WebCrawler webcrawler = WebCrawler.GetInstance();
    if (careTaker != null)
    {
        originator = careTaker.Originator;
    }
    if (careTaker == null)
    {
        var model = webcrawler.LoadFromFile(path);
        originator = new Originator(model);
        careTaker = new CareTaker(originator);
    }

    //careTaker.Restore("nkbrrk");
    //Console.WriteLine($"Originator stateName {originator.StateName}");
    careTaker.Compare("citmxv");
    //originator.MakeChanges(webcrawler);
    //await careTaker.Save();
    //Console.WriteLine("History");
    //careTaker.ShowHistory();
}
public async Task<CrawlResult> GetWebCrawlingResultAsync()
{
    using (WebCrawler webCrawler = new WebCrawler())
    {
        return await webCrawler.PerformCrawlingAsync(ConfigData.Depth, ConfigData.RootResources);
    }
}
public ActionResult ReadAds(Parameters parameters)
{
    if (!IsLogged())
    {
        return Unauthorized();
    }

    var urlParameters = new Dictionary<string, string>
    {
        { "q", parameters.ProductName }
    };
    var baseUrl = string.IsNullOrEmpty(parameters.SearchRegion)
        ? "https://olx.com.br/brasil"
        : $"https://{parameters.SearchRegion}.olx.com.br";
    var webCrawler = new WebCrawler(baseUrl, urlParameters);
    var ads = webCrawler.Read();
    foreach (var ad in ads)
    {
        _adService.Create(ad);
    }
    return NoContent();
}
public CrawlerReport AnalyseSite(dynamic site)
{
    ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12;
    var startUrl = new Uri(site.Url);
    var crawlerSettings = BuildCrawlerSettings(startUrl);
    var crawler = new WebCrawler(crawlerSettings);
    try
    {
        crawler.Start();
        while (crawler.IsRunning)
        {
            Thread.Sleep(500);
        }
        crawler.Report.Save(BuildReportPath());
    }
    catch (Exception ex)
    {
        // logging needs to be added here; ex is passed as the inner exception
        // so the original failure is not lost.
        throw new ApplicationException(string.Format("Error Crawling {0}", startUrl), ex);
    }
    return crawler.Report;
}
private void downloadWorker_DoWork(object sender, DoWorkEventArgs e)
{
    BackgroundWorker backgroundWorker = (BackgroundWorker)sender;
    backgroundWorker.ReportProgress(0);
    SortedSet<ScriptureReference> allReferences = new SortedSet<ScriptureReference>();
    List<string> sitesToSearch = new List<string>(websitesListBox.CheckedItems.OfType<string>());
    int i = 0;
    foreach (string page in sitesToSearch)
    {
        List<ScriptureReference> found = WebCrawler.SearchPageForScriptures(page, out _);

        // Remove references to whole chapters
        found.RemoveAll(x => x.isChapter);

        allReferences.UnionWith(found);
        i++;
        backgroundWorker.ReportProgress((int)(100 * ((float)i / sitesToSearch.Count)));
    }
    e.Result = allReferences;
}
public async Task<ApkName> GetOrCreateApkName(string packageName)
{
    // Return the cached name if we already resolved this package.
    var cached = _con.ApkNameCacheTable.Where(e => e.PackageName.Equals(packageName));
    if (await cached.CountAsync() > 0)
    {
        return await cached.FirstOrDefaultAsync();
    }

    var webCrawler = new WebCrawler();
    var apkName = await webCrawler.GetName(packageName);
    if (!string.IsNullOrEmpty(apkName))
    {
        var cacheTable = new ApkNameCacheTable
        {
            Name = apkName,
            PackageName = packageName
        };
        _con.ApkNameCacheTable.Update(cacheTable);
        await _con.SaveChangesAsync();
        return cacheTable;
    }
    return null;
}
public void CAN_EXTRACT_BASE_URL_FROM_URL_WITH_FILE_IN_SUBDIRECTORY()
{
    var objectUnderTest = new WebCrawler(BASE_URL_WITH_FILE_IN_SUBDIRECTORY);
    var result = objectUnderTest.ExtractBaseUrl(BASE_URL_WITH_FILE_IN_SUBDIRECTORY);
    Assert.AreEqual(BASE_URL, result);
}
public void CAN_EXTRACT_NOT_CLOSED_BASE_URL()
{
    var objectUnderTest = new WebCrawler(BASE_URL_NOT_CLOSED);
    var result = objectUnderTest.ExtractBaseUrl(BASE_URL_NOT_CLOSED);
    Assert.AreEqual(BASE_URL, result);
}
private void AddCover()
{
    try
    {
        byte[] cover = WebCrawler.DownloadCover(lnParameters.urlCover);
        PdfImage pic = PdfImage.GetInstance(cover);
        if (pic.Height > pic.Width)
        {
            // Portrait: scale so the height becomes 700 (page maximum is 800 pixels).
            float percentage = 700 / pic.Height;
            pic.ScalePercent(percentage * 100);
        }
        else
        {
            // Landscape: scale so the width becomes 540 (page maximum is 600 pixels).
            float percentage = 540 / pic.Width;
            pic.ScalePercent(percentage * 100);
        }
        pic.Border = Rectangle.BOX;
        pic.BorderColor = BaseColor.BLACK;
        pic.BorderWidth = 3f;
        pdf.NewPage();
        pdf.Add(pic);
    }
    catch (CoverException)
    {
        // No cover available; skip the page silently.
    }
}
public void RETURNS_READABLE_URL_IN_CASE_OF_INVALID_URL()
{
    var objectUnderTest = new WebCrawler(INVALID_BASE_URL);
    var result = objectUnderTest.ExtractBaseUrl(INVALID_BASE_URL);
    Assert.AreEqual(READABLE_INVALID_BASE_URL, result);
}
public void RETURNS_VALID_URL_WITH_PORT()
{
    var objectUnderTest = new WebCrawler(URL_WITH_PORT);
    var result = objectUnderTest.ExtractBaseUrl(URL_WITH_PORT);
    Assert.AreEqual(URL_WITH_PORT, result);
}
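// The four ExtractBaseUrl tests above imply its behavior: strip file names and
// subdirectories down to the authority, preserve an explicit port, and return a
// readable form (rather than throw) for invalid input. The sketch below is a
// hedged illustration under those assumptions; the actual implementation and the
// test constants (BASE_URL, URL_WITH_PORT, ...) are defined elsewhere and may differ.
public string ExtractBaseUrl(string url)
{
    if (Uri.TryCreate(url, UriKind.Absolute, out Uri parsed))
    {
        // Keeps scheme, host, and any explicit port; drops path, file, and query.
        return parsed.GetLeftPart(UriPartial.Authority);
    }

    // Invalid URL: fall back to a trimmed, readable representation.
    return url.Trim();
}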
public void When_HtmlHasSeveralHyperlinks_Then_ReturnSeveralChildrenNodes()
{
    // Arrange
    ArrangeMocks(DomainBase, new List<Node>() { HomeNode, AboutNode, ContactNode });
    ArrangeMocks(HomeNode.Uri, new List<Node>());
    ArrangeMocks(AboutNode.Uri, new List<Node>());
    ArrangeMocks(ContactNode.Uri, new List<Node>());
    var target = new WebCrawler(_factoryMock.Object, _httpMock.Object);

    // Act
    var result = target.Crawl(DomainBase);

    // Assert
    Assert.AreEqual(3, result.Nodes.Count);
    Assert.AreEqual(HomeNode.Uri, result.Nodes[0].Uri);
    Assert.AreEqual(AboutNode.Uri, result.Nodes[1].Uri);
    Assert.AreEqual(ContactNode.Uri, result.Nodes[2].Uri);
}
public void When_PageContainsVisitedNode_Then_DontRepeatVisitedNode()
{
    // Arrange
    ArrangeMocks(DomainBase, new List<Node>() { HomeNode, AboutNode });
    ArrangeMocks(HomeNode.Uri, new List<Node>() { BaseNode });
    ArrangeMocks(AboutNode.Uri, new List<Node>() { BaseNode });
    var target = new WebCrawler(_factoryMock.Object, _httpMock.Object);

    // Act
    var result = target.Crawl(DomainBase);

    // Assert
    Assert.AreEqual(2, result.Nodes.Count);
    Assert.AreEqual(HomeNode.Uri, result.Nodes[0].Uri);
    Assert.AreEqual(AboutNode.Uri, result.Nodes[1].Uri);
    Assert.AreEqual(0, result.Nodes[0].Nodes[0].Nodes.Count);
    Assert.AreEqual(0, result.Nodes[1].Nodes[0].Nodes.Count);
}
public void TestRemoveScriptSectionFromHTML()
{
    var crawler = new WebCrawler();
    string inputHTML = "<html><script>some javascript</script><body><script>some javascript</script></body></html>";
    string requiredHTML = "<html><body></body></html>";
    Assert.AreEqual(requiredHTML, crawler.RemoveTagsFromHTML(inputHTML, "script"));
}
public void DoJobTest()
{
    var crawler = new WebCrawler { InitLink = "http://dantri.com.vn" };
    crawler.DoJob();
}
public void TestRemoveStyleSectionFromHTML()
{
    var crawler = new WebCrawler();
    string inputHTML = "<html><style>some style</style><body><style>some style</style></body></html>";
    string requiredHTML = "<html><body></body></html>";
    Assert.AreEqual(requiredHTML, crawler.RemoveTagsFromHTML(inputHTML, "style"));
}
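// Both tests above expect RemoveTagsFromHTML to strip every <tag>...</tag>
// block, contents included. Below is a minimal Regex-based sketch consistent
// with those assertions; it is an assumption-laden stand-in for the real method
// (a production crawler would more likely use an HTML parser such as HtmlAgilityPack).
public string RemoveTagsFromHTML(string html, string tag)
{
    // Lazy match plus Singleline removes each block, even when several occur
    // or when the content spans multiple lines.
    string pattern = $"<{tag}[^>]*>.*?</{tag}>";
    return System.Text.RegularExpressions.Regex.Replace(
        html, pattern, string.Empty,
        System.Text.RegularExpressions.RegexOptions.Singleline |
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);
}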
static void Main(string[] args)
{
    ICrawlingFilterDetail crawlingFilterDetail = new CrawlingFilterDetail("jobdetail-iframe", "src", "/jobdetail");
    ICrawlingStats crawlingStats = new XingCrawlingStats(new[] { "jobdetail" }, crawlingFilterDetail);
    IResultWriter resultWriter = new ResultWriter(crawlingStats);
    var walter = new WebCrawler(crawlingStats, resultWriter, new Clock());
    var result = walter.Crawl(new Uri("https://www.xn--jobbrse-d1a.com/list/jobtitle/"), @"c:\temp\WalterResult.csv");
}
public void MakeChanges(WebCrawler webCrawler)
{
    var model = webCrawler.LoadFromFile(MyHtmlModel.Name);
    MyHtmlModel = model;
    StateName = GenerateHashSum();
    Console.WriteLine($"Originator StateName changed to {StateName}");
}
private ConsoleCrawlerApp CreateTarget()
{
    var crawler = new WebCrawler(new NodeFactory(new LinkExtractor()), _httpMock.Object);
    var output = new ConsoleOutputHandler(new TextOutputGenerator(), _fileMock.Object, _consoleMock.Object);
    var consoleApp = new ConsoleCrawlerApp(new ConsoleInputHandler(_clockMock.Object), output, crawler);
    return consoleApp;
}
public WebCrawlerVM()
{
    ConfigReader.Instance.Read();
    maxNestity = ConfigReader.Instance.MaxCrawlNestity;
    rootResources = ConfigReader.Instance.RootResources.ToArray();
    webCrawler = new WebCrawler(maxNestity);
    Clicks = 0;
}
// GET: HomePage
public ActionResult Index()
{
    var webCrawler = new WebCrawler();
    webCrawler.SearchWeb();
    return View();
}
/// <summary>
/// Creates a new instance of the <see cref="Buildit.Crawler.Console.ConsoleCrawlerApp"/> class.
/// </summary>
/// <returns>The newly created <see cref="Buildit.Crawler.Console.ConsoleCrawlerApp"/> object.</returns>
public static IConsoleApp Create()
{
    var crawler = new WebCrawler(new NodeFactory(new LinkExtractor()), new SystemHttp());
    var output = new ConsoleOutputHandler(new TextOutputGenerator(), new SystemFile(), new SystemConsole());
    var consoleApp = new ConsoleCrawlerApp(new ConsoleInputHandler(new SystemClock()), output, crawler);
    return consoleApp;
}
static void Main(string[] args)
{
    #region Debugging code
    //FansCrawler fansCrawler = new FansCrawler(1, 1);
    ////fansCrawler.GetHtmlFromWeiBo(1);
    //fansCrawler.ReadInHtmlContent();
    //List<Fan> fansList = new List<Fan>();
    //StreamReader reader = new StreamReader("content.html");
    //string content = reader.ReadToEnd();
    //fansCrawler.currentHtmlContent = content;
    //fansCrawler.GetInfoFromHtml(fansList);
    //OutputFansList(fansList);
    #endregion

    #region Production code for crawling Weibo posts
    User user = new User();
    ICrawler crawler = null;
    int taskCount = 278;
    for (int i = 1; i <= taskCount; i = i + 25)
    {
        if (i + 24 <= taskCount)
        {
            // The second parameter: 0 crawls posts from the personal profile page,
            // 1 crawls posts from your own home feed.
            // 25 is a fairly reliable batch size found in practice; when a single
            // crawler fetches more than 25 pages, the server may block it.
            crawler = new WebCrawler(user, PageType.PersonalPage, i, 25);
        }
        else
        {
            crawler = new WebCrawler(user, PageType.PersonalPage, i, taskCount - i + 1);
        }
        crawler.RunCrawler(user.FeedList);
    }
    Output(user, crawler.Name);
    #endregion

    #region Production code for crawling fans
    //List<Fan> fansList = new List<Fan>();
    //FansAndFollowCrawler fansCrawler = null;
    //int taskCountForFansPage = 58;
    //RelateType type = RelateType.Follow;
    //for (int i = 1; i <= taskCountForFansPage; i = i + 25)
    //{
    //    if (i + 24 <= taskCountForFansPage)
    //    {
    //        fansCrawler = new FansAndFollowCrawler(type, i, 25);
    //    }
    //    else
    //    {
    //        fansCrawler = new FansAndFollowCrawler(type, i, taskCountForFansPage - i + 1);
    //    }
    //    fansCrawler.RunCrawler(fansList);
    //}
    //Output(fansList, fansCrawler.Name);
    //Console.WriteLine("Total follows: " + fansList.Count);
    #endregion
}
public void getPropertiesTest()
{
    // Arrange
    WebCrawler crawler = new WebCrawler("http://www.bbc.co.uk/news", 2);

    // Assert
    Assert.AreEqual("http://www.bbc.co.uk/news", crawler.root);
    Assert.AreEqual(2, crawler.maxDepth);
}
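// getPropertiesTest implies the shape of this WebCrawler variant: a constructor
// that stores the seed URL and the maximum crawl depth in publicly readable
// members. A hypothetical skeleton matching the assertions above (names taken
// from the test; everything else is an assumption):
public class WebCrawler
{
    public readonly string root;   // seed URL the crawl starts from
    public readonly int maxDepth;  // how many link levels to follow

    public WebCrawler(string root, int maxDepth)
    {
        this.root = root;
        this.maxDepth = maxDepth;
    }
}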
static void Main(string[] args) { Console.WriteLine("Welcome to WebCrawler Mangakakalot"); Console.WriteLine("Do you want to collect data from web page?\nY|N"); string ansr = Console.ReadLine().ToLower(); if (ansr != "y") { return; } Console.WriteLine("With great waiting times, comes great loss of patience..."); //Get data from web page into a list based off a model WebCrawler spider = new WebCrawler(); List <Popular> pops = spider.startCrawler().GetAwaiter().GetResult(); bool smartUser = false; while (!smartUser) { Console.WriteLine("======================"); Console.WriteLine("What to to with all the collected info?"); Console.WriteLine("1 - Upload into DataBase"); Console.WriteLine("2 - Download From DataBase"); int opt = Convert.ToInt32(Console.ReadLine()); switch (opt) { case 1: //Putting each Popular into the database SpiderInsert spider1 = new SpiderInsert(); foreach (Popular pop in pops) { spider1.SetIntoDB(pop); } Console.WriteLine("Upload Action Completed"); spider1.CloseBD(); smartUser = true; break; case 2: Console.WriteLine("Not yet Implemented"); smartUser = true; break; default: Console.WriteLine("Option not calibrated by the system"); continue; } } Console.WriteLine("Press Enter to exit program..."); ConsoleKeyInfo keyInfo = Console.ReadKey(true); if (keyInfo.Key == ConsoleKey.Enter) { System.Environment.Exit(0); } }
static void Main(string[] args)
{
    var directoryName = Environment.ExpandEnvironmentVariables(@"%USERPROFILE%\personal\gazeta\krasev");
    //var directoryName = @"D:\gazeta\krasev";
    var webCrawler = new WebCrawler(directoryName);
    const String baseUrl = "http://www.booksite.ru/krassever/";
    var years = webCrawler.ExtractAll(baseUrl + "index.htm", @" href=""(\d\d\d\d\..+?)""");
    foreach (var year in years)
    {
        var yearUrl = baseUrl + year;
        var issues = webCrawler.ExtractAll(yearUrl, @"<a href=""(\d\d\d\d/(?:\w+/)?\d\d\d\d_\d+\.pdf)");
        foreach (var issue in issues)
        {
            var pdfUrl = baseUrl + issue;
            var directory = webCrawler.Parse(pdfUrl, @"/(\d\d\d\d)/");
            var number = webCrawler.Parse(pdfUrl, @"_(\d+)\.").PadLeft(3, '0');
            var name = webCrawler.Parse(pdfUrl, @"/\d\d\d\d/(?:(\w+)/)\d\d\d\d_");
            if (!String.IsNullOrEmpty(name))
            {
                if (name.Equals("izvestya"))
                {
                    name = "A_";
                }
                else if (name.Equals("krassever"))
                {
                    name = "B_";
                }
                else
                {
                    throw new Exception();
                }
            }
            var fileName = String.Format(@"{0}\{0}_{2}{1}.pdf", directory, number, name);
            Console.WriteLine("{0} => {1}", pdfUrl, fileName);
            webCrawler.AddFile(pdfUrl, fileName);
        }
    }
}
static void Main(string[] args)
{
    //var directoryName = Environment.ExpandEnvironmentVariables(@"%USERPROFILE%\personal\gazeta\sovsib");
    var directoryName = @"D:\gazeta\sovsib";
    var webCrawler = new WebCrawler(directoryName);
    const String baseUrl = "http://elib.ngonb.ru";
    var years = webCrawler.ExtractAll("http://elib.ngonb.ru/jspui/handle/NGONB/32", @"<option value=""NGONB/(\d+)"">\d\d\d\d</option>");
    foreach (var year in years)
    {
        var url = String.Format("http://elib.ngonb.ru/jspui/handle/NGONB/{0}/browse?type=dateissued&submit_browse=Issue+Date", year);
        while (true)
        {
            var issues = webCrawler.ExtractAll(url, @"<a href=""/jspui/handle/NGONB/(\d+)"">(.*?)</a></td>");
            foreach (var issue in issues)
            {
                var issueUrl = "http://elib.ngonb.ru/jspui/handle/NGONB/" + issue;
                var pdfUrl = webCrawler.Extract(issueUrl, @"""(/jspui/bitstream/NGONB/" + issue + @"(?:.+?).pdf)""");
                if (!String.IsNullOrEmpty(pdfUrl))
                {
                    pdfUrl = baseUrl + pdfUrl;
                    var date = webCrawler.Extract(issueUrl, @">(\d\d\d\d-\d\d-\d\d)<");
                    var directory = date.Substring(0, 4);
                    var number = webCrawler.Extract(issueUrl, @">(\d+).pdf<");
                    var fileName = String.Format(@"{0}\{1}_{2}.pdf", directory, date.Replace('-', '_'), number);
                    Console.WriteLine("{0} => {1}", pdfUrl, fileName);
                    webCrawler.AddFile(pdfUrl, fileName);
                }
            }

            // next page
            url = webCrawler.Extract(url, @"href=""(.+?)"">next");
            if (String.IsNullOrEmpty(url))
            {
                break;
            }
            url = baseUrl + url.XmlUnescape();
        }
    }
}
private void Run(string[] args)
{
    parseArguments(args);
    if (show_help || urls.Count <= 0)
    {
        showHelp();
        Environment.Exit(1);
    }

    // run the crawler on all provided URLs
    foreach (string url in urls)
    {
        WebCrawler crawl = new WebCrawler(url, depth, cross_domain, debug);
        crawl.RunAsync(200); // run the crawler in async mode
    }
    Environment.Exit(0);
}
public CrawlerReport AnalyseSite(dynamic site)
{
    var startUrl = new Uri(site.Url);
    var crawlerSettings = BuildCrawlerSettings(startUrl);
    var crawler = new WebCrawler(crawlerSettings);
    try
    {
        crawler.Start();
        while (crawler.IsRunning)
        {
            Thread.Sleep(500);
        }
        crawler.Report.Save(BuildReportPath());
    }
    catch (Exception ex)
    {
        // logging needs to be added here; ex is passed as the inner exception
        // so the original failure is not lost.
        throw new ApplicationException(string.Format("Error Crawling {0}", startUrl), ex);
    }
    return crawler.Report;
}
CrawlerReport RunAnalysis(Uri startUrl)
{
    var settings = new CrawlerSettings(startUrl);
    settings.UseUserAgentForRobots = true;
    settings.ExternalLinkCriteria = ExternalLinkCriteria.SameFolderAndDeeper;

    // Generate a unique name
    var name = settings.Name = "SEOREPORT" + DateTime.Now.ToString("yy-MM-dd hh-mm-ss");

    // Use the same directory as the default used by the UI
    var path = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments),
        "IIS SEO Reports");
    settings.IgnoreRobots = true;
    settings.IgnoreNoIndex = true;
    settings.IgnoreNoFollow = true;
    settings.Timeout = 200000;
    settings.MaximumLinkCount = MaxPages;
    settings.DirectoryCache = Path.Combine(path, settings.Name);

    // Create a new crawler and start running
    var crawler = new WebCrawler(settings);
    crawler.Start();
    while (crawler.IsRunning)
    {
        Thread.Sleep(2000);
        log.LogMessage("{0,9:N0} - {1,9:N0} - {2,9:N2} MB",
            crawler.Report.GetUrlCount(),
            crawler.RemainingUrls,
            crawler.BytesDownloaded / 1048576.0f);
    }
    crawler.Report.Save(Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments) + "/IIS Seo Reports/");
    return crawler.Report;
}
public void Init()
{
    _clock.FormattedCurrentTime().Returns(ArbitraryFormattedCurrentTime);
    _crawler = new WebCrawler(_crawlingStats, _resultWriter, _clock);
}
void crawler_CrawlStarted(object sender, WebCrawler.Event.CrawlStartedEventArgs e)
{
    //throw new NotImplementedException();
}
void crawler_CrawlAnnounced(object sender, WebCrawler.Event.CrawlAnnouncedEventArgs e)
{
    this.Dispatcher.BeginInvoke(new UpdateUserInterface(OnCrawlAnnounced), e);
}