/// <summary>
/// Starts the spider to collect the list of all relevant urls,
/// then analyses the pages recognized as record pages.
/// </summary>
/// <param name="firstUrl">Seed url the spider starts crawling from.</param>
private void startSpider(string firstUrl)
{
    WebSpider spider = new WebSpider(firstUrl, REQUIRED_URL_BODY, NUMBER_OF_PAGES_LIMIT);
    spider.Execute();

    StringBuilder builder = new StringBuilder();
    int i = 0;
    foreach (DictionaryEntry entry in spider.WebPages)
    {
        var page = ((System.Uri)entry.Key).ToString();
        if (!page.Contains(REQUIRED_URL_BODY_WITHOUT))
        {
            continue;
        }

        if (page.Contains(RECORD_PAGE_IDENTIFIER))
        {
            analyzeContent(page);
        }

        builder.AppendLine(page);
        i++;
        if (i % 10 == 0)
        {
            // BUG FIX: the original passed 2 as an unused extra argument to
            // string.Format instead of as the digits argument of Math.Round,
            // so the percentage was rounded to 0 decimal places.
            Console.Out.WriteLine(string.Format(
                "Progress {0} %",
                Math.Round(i / (double)spider.WebPages.Count * 100, 2)));
        }
    }

    // Path.Combine inserts the directory separator the plain string
    // concatenation silently dropped when outputDirectory had no trailing slash.
    File.WriteAllText(Path.Combine(outputDirectory, LOG_NAME), builder.ToString());
    // NOTE(review): collectionIndex is presumably advanced inside analyzeContent — confirm.
    Console.Out.WriteLine("Processed urls: " + collectionIndex);
}
/// <summary>
/// Loads a page definition from the given template file, fetches its data
/// for <paramref name="url"/> and returns the matching validated page.
/// </summary>
/// <param name="template">Path of the template file describing the page.</param>
/// <param name="url">Url assigned to the page before fetching.</param>
/// <returns>The fetched page whose Value equals <paramref name="url"/>, or null when absent.</returns>
public Page FetchPage(string template, string url)
{
    var requested = Page.GetFromFile(template);
    requested.Value = url;

    WebSpider spider = new WebSpider();
    spider.FetchData(requested);
    ValidatePages(spider.Pages);

    return spider.Pages.Find(candidate => candidate.Value.Equals(url));
}
/// <summary>
/// Crawls from <paramref name="source"/> until the frontier is empty or
/// <paramref name="max_count"/> pages have been visited.
/// </summary>
/// <param name="source">Url of the page the crawl starts from.</param>
/// <param name="max_count">Upper bound on the number of visited pages.</param>
static void LaunchSpider(string source, int max_count = 5000)
{
    WebSpider spider = new WebSpider();
    spider.SetSourcePage(source);
    while (spider.CountToVisit > 0 && spider.Count < max_count)
    {
        // FIX: the returned WEBPAGE was assigned to an unused local;
        // SingleStep is called for its side effect of advancing the crawl.
        spider.SingleStep();
    }
}
/// <summary>
/// Validates the command-line url and options, runs the web spider against
/// the url and optionally saves the resulting report.
/// </summary>
/// <returns>0 on success, -1 when any argument fails validation.</returns>
public override int Run()
{
    Uri uri;
    // Default to http:// when no scheme was supplied.
    url = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
          url.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
        ? url
        : string.Format(CultureInfo.InvariantCulture, "http://{0}", url);
    if (!Uri.TryCreate(url, UriKind.Absolute, out uri))
    {
        Logger.Error(CultureInfo.CurrentUICulture, Resources.ParseUrlRunCantCreateUriError);
        return -1;
    }

    // -1 means "no limit was given"; a supplied value must be a positive integer.
    int count = -1;
    // FIX: parse with the invariant culture, consistent with the invariant
    // formatting used above (CA1305); the original used the current culture.
    if (!string.IsNullOrEmpty(number) &&
        (!int.TryParse(number, NumberStyles.Integer, CultureInfo.InvariantCulture, out count) || count < 1))
    {
        Logger.Error(Resources.ParseUrlRunNotIntegerError, number);
        return -1;
    }

    if (!string.IsNullOrEmpty(outputFileFormat) && FileFormat == Report.OutputFileFormat.None)
    {
        Logger.Error(Resources.ParseUrlRunUnsupportedFormatError, outputFileFormat);
        return -1;
    }

    // Credentials only make sense as a pair.
    if (string.IsNullOrEmpty(username) != string.IsNullOrEmpty(password))
    {
        Logger.Error("Username and Password should be both set.");
        return -1;
    }

    WebSpiderOptions options = new WebSpiderOptions
    {
        UriProcessedCountMax = count,
        ShowSuccessUrls = !errorsOnly,
        Username = username,
        Password = password,
        Domain = domain
    };

    Report report = new WebSpider(uri, options).Execute();
    if (FileFormat != Report.OutputFileFormat.None)
    {
        report.SaveReport(FileFormat, htmlTemplate);
    }

    return 0;
}
/// <summary>
/// Shows (or re-activates) the MDI child form identified by name on the UI
/// thread. A never-created or disposed form is recreated; a live one is
/// brought to the front.
/// </summary>
/// <param name="formName">Either "WebSpider" or "ScreenConsole"; any other value is ignored.</param>
public void StartForm(string formName)
{
    switch (formName)
    {
        case "WebSpider":
            this.Invoke(new Action(() =>
            {
                // FIX: the original duplicated the create/show branch for the
                // null and IsDisposed cases; both mean "no usable form yet".
                if (_formWebSpider == null || _formWebSpider.IsDisposed)
                {
                    _formWebSpider = new WebSpider();
                    _formWebSpider.MdiParent = this;
                    _formWebSpider.Show();
                }
                else
                {
                    _formWebSpider.Activate();
                }
            }));
            break;

        case "ScreenConsole":
            this.Invoke(new Action(() =>
            {
                if (_formScreenConsole == null || _formScreenConsole.IsDisposed)
                {
                    _formScreenConsole = new ScreenConsole();
                    _formScreenConsole.MdiParent = this;
                    _formScreenConsole.Show();
                }
                else
                {
                    _formScreenConsole.Activate();
                }
            }));
            break;
    }
}
/// <summary>
/// Creates and runs a spider over the given uri, then snapshots every
/// crawled page state into the <c>Pages</c> array.
/// </summary>
/// <param name="uri">Start url for the crawl.</param>
/// <param name="baseUri">Base url the crawl is restricted to.</param>
/// <param name="maxUri">Maximum number of uris to process.</param>
public RunSpider(string uri, string baseUri, int maxUri)
{
    Spider = new WebSpider(uri, baseUri, maxUri);
    Spider.Execute();

    ICollection states = Spider.WebPages.Values;
    Pages = new WebPageState[states.Count];

    int slot = 0;
    foreach (WebPageState state in states)
    {
        Pages[slot] = state;
        slot++;
    }
}
/// <summary>
/// Builds one page per (url -> template) entry, fetches each of them with a
/// single spider and returns the spider's validated page list.
/// </summary>
/// <param name="para">Map of url (key) to template file path (value).</param>
/// <returns>All pages collected by the spider after validation.</returns>
public List <Page> FetchPages(Dictionary <string, string> para)
{
    WebSpider spider = new WebSpider();

    // First materialize every page from its template...
    var pending = new List<Page>();
    foreach (KeyValuePair<string, string> entry in para)
    {
        var page = Page.GetFromFile(entry.Value);
        page.Value = entry.Key;
        pending.Add(page);
    }

    // ...then fetch them all with the one spider instance.
    foreach (var page in pending)
    {
        spider.FetchData(page);
    }

    ValidatePages(spider.Pages);
    return spider.Pages;
}
/// <summary>
/// A freshly constructed spider must expose a non-null Pages collection.
/// </summary>
public void WebSpider_Constructor_PagesNotNull()
{
    // Arrange / Act
    var spider = new WebSpider();

    // Assert
    Assert.IsNotNull(spider.Pages);
}