public static void Run()
{
    //NCrawlerModule.Setup();
    // Crawl http://bidvportal.vn/ with a four step pipeline:
    //  * Step 1 - HtmlDocumentProcessor: parses html, extracts links and text
    //  * Step 2 - GoogleLanguageDetection: detects page language from the extracted text
    //  * Step 3 - Mp3FileProcessor: processes mp3 files
    //  * Step 4 - BIDVObjectDumperStep: custom step that visualizes the crawl
    using (Crawler c = new Crawler(
        new Uri("http://bidvportal.vn/"),
        new HtmlDocumentProcessor(),   // Process html
        //new iTextSharpPdfProcessor.iTextSharpPdfProcessor(), // PDF text extraction
        new GoogleLanguageDetection(), // Language detection
        new Mp3FileProcessor(),        // Mp3 processing
        new BIDVObjectDumperStep())    // Custom step to visualize crawl
    {
        MaximumThreadCount = 10,
        MaximumCrawlDepth = 2,
        //ExcludeFilter = Program.ExtensionsToSkip,
    })
    {
        // Begin crawl
        c.Crawl();
    }
}
public static void Run()
{
    NCrawlerModule.Setup();
    Console.Out.WriteLine("Simple crawl demo");

    // Crawl the target with a three step pipeline:
    //  * Step 1 - HtmlDocumentProcessor: parses html, extracts links and text
    //  * Step 2 - GoogleLanguageDetection: detects page language from the extracted text
    //  * Step 3 - DumperStep: custom step that visualizes the crawl
    using (Crawler c = new Crawler(
        new Uri("http://gre.magoosh.com/questions/5"),
        new HtmlDocumentProcessor(), // Process html
        new GoogleLanguageDetection(),
        new DumperStep())            // Custom step to visualize crawl
    {
        MaximumThreadCount = 1,
        MaximumCrawlDepth = 1,
        ExcludeFilter = Program.ExtensionsToSkip,
    })
    {
        // Begin crawl
        c.Crawl();
    }
}
/// <summary>
/// Pipeline step intended to persist crawled BIDV pages; the actual
/// extraction/persistence logic is currently disabled, so only
/// <c>db.SaveChanges()</c> runs.
/// </summary>
/// <param name="crawler">
/// The crawler.
/// </param>
/// <param name="propertyBag">
/// The property bag.
/// </param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value;
    string cultureDisplayValue = "N/A";
    if (!contentCulture.IsNull())
    {
        cultureDisplayValue = contentCulture.DisplayName;
    }

    TextExtraction t = new TextExtraction();

    // NOTE(review): lock(this) is an anti-pattern (callers can lock the same
    // instance); kept as-is to preserve behavior. The extraction logic that
    // used to run here has been removed; the lock body only constructs an
    // unused BIDVObject.
    lock (this)
    {
        BIDVObject item = new BIDVObject();
    }

    try
    {
        db.SaveChanges();
    }
    catch (Exception ex)
    {
        Console.WriteLine("=====================================================");
        Console.WriteLine(ex.Message);
    }
}
/// <summary>
/// Persists a crawled BIDV portal document: extracts a summary between the
/// markers "Chi tiết văn bản" and "Xem toàn màn hình", parses the publication
/// date ("Ngày phát hành"), clears the raw text and saves the record.
/// </summary>
/// <param name="crawler">
/// The crawler.
/// </param>
/// <param name="propertyBag">
/// The property bag with the downloaded page.
/// </param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value;
    string cultureDisplayValue = "N/A";
    if (!contentCulture.IsNull())
    {
        cultureDisplayValue = contentCulture.DisplayName;
    }

    TextExtraction t = new TextExtraction();

    // NOTE(review): lock(this) is an anti-pattern; kept for compatibility.
    lock (this)
    {
        BIDVObject item = new BIDVObject();
        item.OriginalUrl = propertyBag.Step.Uri.ToString();
        if (!IsDuplicate(item.OriginalUrl))
        {
            item.Title = propertyBag.Title;
            item.StatusDescription = propertyBag.StatusDescription;
            item.ResponseUri = propertyBag.ResponseUri.ToString();
            item.Text = propertyBag.Text;
            item.Depth = propertyBag.Step.Depth;
            item.LastModified = propertyBag.LastModified;
            item.OriginalReferrerUrl = propertyBag.OriginalReferrerUrl.ToString();
            item.Server = propertyBag.Server;

            // Summary: fragment between "Chi tiết văn bản" and "Xem toàn màn hình",
            // with line breaks flattened and whitespace collapsed.
            string description = t.GetBetween2Words("Chi tiết văn bản", "Xem toàn màn hình",
                item.Text.Replace("\r", " ").Replace("\n", " "));
            item.Summary = t.RemoveWhiteSpace(description);

            // Publication date: "Ngày phát hành" up to "Số đi"; spaces become '/'.
            string strNgayPhatHanh = t.GetBetween2Words("Ngày phát hành", "Số đi", item.Summary);
            strNgayPhatHanh = strNgayPhatHanh.Replace(' ', '/').Remove(0, ("Ngày phát hành").Length);
            string[] strSplit = { "/" };
            // NOTE(review): the parsed parts are never used afterwards; parsing is
            // kept because a malformed date currently aborts persistence by
            // throwing here — confirm whether that is intended.
            int day = int.Parse(strNgayPhatHanh.Split(strSplit, StringSplitOptions.None)[1]);
            int month = int.Parse(strNgayPhatHanh.Split(strSplit, StringSplitOptions.None)[2]);
            int year = int.Parse(strNgayPhatHanh.Split(strSplit, StringSplitOptions.None)[3]);

            // Clear the text field before persisting.
            item.Text = null;
            item.IsToEmail = false;
            db.AddToBIDVObjects(item);
            item.ContentEncoding = propertyBag.ContentEncoding;
            item.ContentType = propertyBag.ContentType;
        }
    }
    try
    {
        db.SaveChanges();
    }
    catch (Exception ex)
    {
        // Preserve the original exception as InnerException; the previous
        // "throw new Exception(ex.Message)" discarded type and stack trace.
        throw new Exception(ex.Message, ex);
    }
}
/// <summary>
/// Runs the template matching this page (if any) against the downloaded html
/// document and stores the parsed XML result in MongoDB.
/// </summary>
/// <param name="crawler">The crawler.</param>
/// <param name="propertyBag">The property bag with the downloaded page.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    ITemplate template = SelectTemplate(propertyBag);
    if (template == null)
    {
        return; // no template matches this page
    }

    XmlDocument result = template.Parse(propertyBag["HtmlDoc"].Value as HtmlAgilityPack.HtmlDocument);
    MongoDBSaver saver = new MongoDBSaver();
    saver.Save(propertyBag, result);
}
public static void Run()
{
    Console.Out.WriteLine("Simple crawl demo using local database a storage");

    // Crawl settings come from app.config.
    var targetToCrawl = ConfigurationManager.AppSettings["CrawlTargetUrl"];
    var maximumThreadCount = int.Parse(ConfigurationManager.AppSettings["MaximumThreadCount"]);
    var maximumCrawlDepth = int.Parse(ConfigurationManager.AppSettings["MaximumCrawlDepth"]);

    // Use the database-backed crawler services (CrawlQueue/CrawlHistory tables).
    DbServicesModule.Setup(true);

    using (Crawler c = new Crawler(
        new Uri(targetToCrawl),
        new WholeHtmlProcessor(), // Process html
        new DumperStep())         // Custom step to visualize crawl
    {
        MaximumThreadCount = maximumThreadCount,
        MaximumCrawlDepth = maximumCrawlDepth,
        ExcludeFilter = Program.ExtensionsToSkip,
    })
    {
        // Re-enable queue entries excluded by a previous run for this target's group.
        AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
        {
            if (e.CrawlQueue.Any())
            {
                var uri = new Uri(targetToCrawl);
                var groupId = uri.GetHashCode();
                Console.Out.WriteLine("GroupId=" + groupId);
                e.ExecuteStoreCommand(
                    "Update CrawlQueue set Exclusion='false' where GroupId={0} and Exclusion='true'",
                    groupId);
            }
        });

        // Begin crawl
        Console.Out.WriteLine(" Begin crawl");
        c.Crawl();
    }
}
/// <summary>
/// Crawls the configured website and indexes each document, using twice the
/// processor count as the crawler thread count and honoring robot rules.
/// </summary>
public void Crawl()
{
    int threadCount = System.Environment.ProcessorCount * 2;
    using (Crawler c = new Crawler(
        new Uri(this.WebsiteUrl),
        new HtmlDocumentProcessor(),
        new DocumentIndexStep(this.Config, this.LogWrapper)))
    {
        this.LogWrapper.Info("Crawler started: Using " + threadCount + " threads");
        c.AdhereToRobotRules = true;
        c.MaximumThreadCount = threadCount;
        // Skip common static resources.
        c.ExcludeFilter = new[]
        {
            new NCrawler.Services.RegexFilter(
                new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png|\.ico)"))
        };
        c.Crawl();
    }
}
/// <summary>
/// Persists a crawled ASP.NET page's metadata (the text field is
/// intentionally cleared) unless the url has already been stored.
/// </summary>
/// <param name="crawler">
/// The crawler.
/// </param>
/// <param name="propertyBag">
/// The property bag with the downloaded page.
/// </param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value;
    string cultureDisplayValue = "N/A";
    if (!contentCulture.IsNull())
    {
        cultureDisplayValue = contentCulture.DisplayName;
    }

    TextExtraction t = new TextExtraction();

    // NOTE(review): lock(this) is an anti-pattern; kept for compatibility.
    lock (this)
    {
        ASPNETObject item = new ASPNETObject();
        item.OriginalUrl = propertyBag.Step.Uri.ToString();
        if (!IsDuplicate(item.OriginalUrl))
        {
            item.Title = propertyBag.Title;
            item.StatusDescription = propertyBag.StatusDescription;
            item.ResponseUri = propertyBag.ResponseUri.ToString();
            item.Text = null; // text content is intentionally not stored
            item.Depth = propertyBag.Step.Depth;
            item.LastModified = propertyBag.LastModified;
            item.OriginalReferrerUrl = propertyBag.OriginalReferrerUrl.ToString();
            item.Server = propertyBag.Server;
            db.AddToASPNETObjects(item);
            item.ContentEncoding = propertyBag.ContentEncoding;
            item.ContentType = propertyBag.ContentType;
            item.IsToEmail = false;
            item.Summary = propertyBag.Title;
        }
    }
    try
    {
        db.SaveChanges();
    }
    catch (Exception ex)
    {
        // Preserve the original exception as InnerException; the previous
        // "throw new Exception(ex.Message)" discarded type and stack trace.
        throw new Exception(ex.Message, ex);
    }
}
/// <summary>
/// Crawls the url described by <paramref name="ci"/>, choosing depth and
/// include/exclude filters based on the recognized site type.
/// </summary>
/// <param name="ci">Crawler info carrying the start url.</param>
private void DO(CrawlerInfo ci)
{
    var uri = new Uri(ci.url.Url);
    var siteType = HtmlParse.RecogSite(uri);

    var crawler = new NCrawler.Crawler(uri, new HtmlDocumentProcessor(), new MyPipelineStep(ci))
    {
        MaximumCrawlDepth = CrawlArgs.CrawlDepth(siteType),
        MaximumThreadCount = 5,
        IncludeFilter = CrawlArgs.IncludeFilter(siteType),
        ExcludeFilter = CrawlArgs.ExcludeFilter(siteType),
    };
    crawler.Crawl();
}
/// <summary>
/// Crawls from the root url using an in-memory NCrawler setup and returns the
/// site hash accumulated by the download event handlers.
/// </summary>
/// <param name="maxThreadCount">Maximum number of crawler threads.</param>
/// <param name="maxCrawlDepth">Maximum crawl depth.</param>
/// <returns>The accumulated site hash.</returns>
public String Crawl(int maxThreadCount = 4, int maxCrawlDepth = 100)
{
    // Use an in-memory setup
    NCrawlerModule.Setup();

    // The html document processor always runs first in the pipeline.
    this.pipelineSteps.Insert(0, new HtmlDocumentProcessor());

    using (Crawler c = new Crawler(this.root, this.pipelineSteps.ToArray<NCrawler.Interfaces.IPipelineStep>())
    {
        MaximumThreadCount = maxThreadCount,
        MaximumCrawlDepth = maxCrawlDepth,
        AdhereToRobotRules = false,
    })
    {
        // Method-group subscriptions are equivalent to the explicit
        // EventHandler<T> constructions they replace.
        c.BeforeDownload += c_BeforeDownload;
        c.AfterDownload += c_AfterDownload;
        c.DownloadProgress += c_DownloadProgress;
        c.Crawl();
    }

    return this.siteHash;
}
/// <summary>
/// Entry point: prints usage when called without arguments, otherwise parses
/// the options and crawls http://ncrawler.codeplex.com with console output.
/// </summary>
private static void Main(string[] args)
{
    // Without arguments just show usage and exit.
    if (args == null || args.Length == 0)
    {
        arguments.ShowUsageShort();
        return;
    }

    arguments.m_StartupArgumentOptionSet.Parse(args);
    using (Crawler crawler = new Crawler(
        new Uri("http://ncrawler.codeplex.com"),
        new HtmlDocumentProcessor(),
        new ConsolePipelineStep()))
    {
        crawler.MaximumThreadCount = 10;
        crawler.Cancelled += crawler_Cancelled;
        crawler.DownloadException += crawler_DownloadException;
        crawler.DownloadProgress += crawler_DownloadProgress;
        crawler.PipelineException += crawler_PipelineException;
        crawler.Crawl();
    }
}
/// <summary>
/// Parses the downloaded page into site-specific records and adds each one to
/// the data store, counting stored records on the crawler info.
/// </summary>
/// <param name="crawler">The crawler.</param>
/// <param name="propertyBag">The property bag with the downloaded page.</param>
public void Process(NCrawler.Crawler crawler, PropertyBag propertyBag)
{
    // Dispose the response when done — the original fetched it into an unused
    // local (`var rsp = ...`) and leaked it.
    using (var response = propertyBag.GetResponse())
    {
        try
        {
            HtmlDocument htmlDoc = HtmlParse.LoadFromHtml(propertyBag);
            var siteType = HtmlParse.RecogSite(propertyBag.ResponseUri);
            var records = Parse(htmlDoc, siteType);
            if (records == null)
            {
                return;
            }
            foreach (var record in records)
            {
                DAL.Data.Add(record);
                ++ci.Count;
            }
        }
        catch (NullReferenceException)
        {
            // Best-effort: pages that fail to parse are skipped silently
            // (deliberate in the original; kept as-is).
        }
    }
}
/// <summary>
/// Indexes a crawled document: adds/updates it in the repository on HTTP 200,
/// deletes it on 404, and logs a warning for any other status.
/// </summary>
/// <param name="crawler">The crawler (used once to hook CrawlFinished).</param>
/// <param name="propertyBag">The property bag with the downloaded document.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    // Subscribe to CrawlFinished only once, on the first processed document.
    // NOTE(review): this flag is not synchronized — confirm whether Process
    // can run concurrently before relying on single subscription.
    if (!bindevents)
    {
        crawler.CrawlFinished += new EventHandler<CrawlFinishedEventArgs>(crawler_CrawlFinished);
        bindevents = true;
    }

    string id = config.GetDocumentPath(propertyBag.Step.Uri);
    if (propertyBag.StatusCode == System.Net.HttpStatusCode.OK)
    {
        repository.AddUpdate(id, propertyBag.Title, propertyBag.Text, propertyBag.LastModified);
        log.Info("Add/Update [" + id + "]");
    }
    else if (propertyBag.StatusCode == System.Net.HttpStatusCode.NotFound)
    {
        // Fixed typo in the log message ("encoutered" -> "encountered").
        log.Warning("Crawler encountered 404 for [" + id + "]");
        repository.Delete(id);
    }
    else
    {
        log.Warning(string.Format(
            "Crawler encountered status {0} - {4} ({1}) for document {2} - {3}",
            propertyBag.StatusCode.ToString(),
            propertyBag.StatusDescription,
            id,
            propertyBag.Step.Uri,
            ((int)propertyBag.StatusCode).ToString()));
    }
}
/// <summary>
/// Dumps information about the crawled page (url, text, content type, depth,
/// culture, thread stats) to the console.
/// </summary>
/// <param name="crawler">
/// The crawler.
/// </param>
/// <param name="propertyBag">
/// The property bag.
/// </param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value;
    string cultureDisplayValue = contentCulture.IsNull() ? "N/A" : contentCulture.DisplayName;

    // Serialize console output across crawler threads.
    lock (this)
    {
        Console.Out.WriteLine(ConsoleColor.Gray, "Url: {0}", propertyBag.Step.Uri);
        Console.Out.WriteLine(ConsoleColor.Blue, "stuff -> " + propertyBag.Text);
        Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tContent type: {0}", propertyBag.ContentType);
        Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tContent length: {0}",
            propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length);
        Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tDepth: {0}", propertyBag.Step.Depth);
        Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tCulture: {0}", cultureDisplayValue);
        Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tThreadId: {0}", Thread.CurrentThread.ManagedThreadId);
        Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tThread Count: {0}", crawler.ThreadsInUse);
        Console.Out.WriteLine();
    }
}
/// <summary>
/// Applies this component's crawl limits: two worker threads, depth one.
/// </summary>
/// <param name="c">The crawler to configure.</param>
public void CrawlerSetting(Crawler c)
{
    c.MaximumThreadCount = 2;
    c.MaximumCrawlDepth = 1;
}
/// <summary>
/// Configures service-point networking and builds the crawler for
/// http://www.cnblogs.com with an EchoStep whose OneWorkFinished event is
/// wired to ShowText.
/// </summary>
private void CreateCrawler()
{
    // Loosen connection limits so the crawler is not throttled by defaults.
    ServicePointManager.MaxServicePoints = 999999;
    ServicePointManager.DefaultConnectionLimit = 999999;
    ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls;
    ServicePointManager.CheckCertificateRevocationList = true;
    ServicePointManager.EnableDnsRoundRobin = true;

    var echo = new EchoStep();
    echo.OneWorkFinished += ShowText;

    _crawler = new Crawler(
        new Uri("http://www.cnblogs.com"),
        new HtmlDocumentProcessor(),
        echo)
    {
        MaximumThreadCount = 1,
        MaximumCrawlDepth = 3,
        // Skip common static resources.
        ExcludeFilter = new[]
        {
            new RegexFilter(
                new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png|\.ico)",
                    RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase))
        },
    };
}
/// <summary>
/// Raises the OneWorkFinished event with the crawled page's title.
/// </summary>
/// <param name="crawler">
/// The crawler.
/// </param>
/// <param name="propertyBag">
/// The property bag.
/// </param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    // Serialize event dispatch across crawler threads.
    lock (this)
    {
        InvokeOneWorkFinished(propertyBag.Title);
    }
}
/// <summary>
/// Runs the supplied crawler.
/// </summary>
/// <param name="c">The crawler to run.</param>
public void RunCrawl(Crawler c)
{
    c.Crawl();
}
/// <summary>
/// Writes every discovered url into the current HTTP response.
/// </summary>
/// <param name="crawler">The crawler.</param>
/// <param name="propertyBag">The property bag for the discovered step.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    HttpContext.Current.Response.Write("<br>FindUrl:" + propertyBag.Step.Uri);
}
/// <summary>
/// Crawls the configured website and indexes each document, applying this
/// instance's crawl settings (threads, depth, filters, sensitivity, robots).
/// </summary>
public void Crawl()
{
    using (Crawler c = new Crawler(
        new Uri(this.WebsiteUrl),
        new HtmlDocumentProcessor(FilterTextRules, FilterLinksRules),
        new DocumentIndexStep(this.Config, this.LogWrapper)))
    {
        this.LogWrapper.Info("Crawler started: Using " + MaximumThreadCount + " threads");
        c.AdhereToRobotRules = AdhereToRobotRules;
        c.MaximumThreadCount = MaximumThreadCount;
        c.ExcludeFilter = ExcludeFilter;
        c.UriSensitivity = UriSensitivity;
        c.MaximumCrawlDepth = MaximumCrawlDepth;
        c.Crawl();
    }
}
/// <summary>
/// Pipeline step that loads the downloaded page into an HtmlAgilityPack
/// document, optionally stores the raw html to disk (when the url matches a
/// configured "recipe" regex, or unconditionally when none are configured),
/// removes the processed entry from the CrawlQueue table, and enqueues every
/// link found on the page.
/// </summary>
/// <param name="crawler">
/// The crawler.
/// </param>
/// <param name="propertyBag">
/// The property bag.
/// </param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    AspectF.Define.
        NotNull(crawler, "crawler").
        NotNull(propertyBag, "propertyBag");
    // Keys are truncated to 396 chars — presumably to match the
    // CrawlHistory/CrawlQueue column width; TODO confirm against schema.
    string stepUri = Uri.UnescapeDataString(propertyBag.Step.Uri.AbsoluteUri);
    if (stepUri.Length > 396)
    {
        stepUri = stepUri.Substring(0, 396);
    }
    var crawlHistory = AspectF.Define.
        Return<CrawlHistory, NCrawlerEntitiesDbServices>(
            e => e.CrawlHistory.Where(m => m.Key == stepUri).FirstOrDefault());
    // No history row for this key: drop the queue entry and stop.
    if (crawlHistory == null)
    {
        AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
        {
            e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", stepUri);
        });
        return;
    }
    try
    {
        // Non-OK responses: dequeue and stop.
        if (propertyBag.StatusCode != HttpStatusCode.OK)
        {
            AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
            {
                e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
                //CrawlQueue result = e.CrawlQueue.FirstOrDefault(q => q.Key == crawlHistory.Key);
                //if (!result.IsNull())
                //{
                //    e.DeleteObject(result);
                //    e.SaveChanges();
                //}
            });
            return;
        }
        // Non-html content: dequeue and stop.
        if (!IsHtmlContent(propertyBag.ContentType))
        {
            AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
            {
                e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
                //CrawlQueue result = e.CrawlQueue.FirstOrDefault(q => q.Key == crawlHistory.Key);
                //if (!result.IsNull())
                //{
                //    e.DeleteObject(result);
                //    e.SaveChanges();
                //}
            });
            return;
        }
        HtmlDocument htmlDoc = new HtmlDocument
        {
            OptionAddDebuggingAttributes = false,
            OptionAutoCloseOnEnd = true,
            OptionFixNestedTags = true,
            OptionReadEncoding = true
        };
        // Detect the document encoding first, rewind the stream, then parse
        // with the detected encoding when one was found.
        using (Stream reader = propertyBag.GetResponse())
        {
            Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
            reader.Seek(0, SeekOrigin.Begin);
            if (!documentEncoding.IsNull())
            {
                htmlDoc.Load(reader, documentEncoding, true);
            }
            else
            {
                htmlDoc.Load(reader, true);
            }
            //string content = reader.ReadToEnd();
            //resultHtmlContent = content;
        }
        //string steplUri = propertyBag.ResponseUri.OriginalString;
        string orginalHtmlContent = htmlDoc.DocumentNode.OuterHtml;
        string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
        DocumentWithLinks links = htmlDoc.GetLinks();
        //string urlRegex = @"^http://www.bbc.co.uk/food/recipes/[^#/]+$";
        List<string> recipeRegex = null;
        // "OriginalWebSite.txt" is read once and cached for one day under a
        // key derived from the application base directory.
        var jsonStr = cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite") as string;
        if (jsonStr == null)
        {
            using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
            {
                jsonStr = stream.ReadToEnd();
                var policy = new CacheItemPolicy();
                policy.Priority = CacheItemPriority.NotRemovable;
                policy.AbsoluteExpiration = DateTimeOffset.Now.AddDays(1);
                cache.Set(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite", jsonStr, policy);
                Console.WriteLine("cache --" + AppDomain.CurrentDomain.BaseDirectory + " :" + cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite"));
            }
        }
        var json = JsonConvert.DeserializeObject<OriginalWebSiteTxt>(jsonStr);
        if (json.RecipeRegex != null && json.RecipeRegex.Count > 0)
        {
            recipeRegex = json.RecipeRegex;
        }
        // Store the page when its url matches any configured regex, or
        // unconditionally when no regexes are configured.
        bool needToStore = false;
        if (recipeRegex != null)
        {
            foreach (var regex in recipeRegex)
            {
                if (Regex.IsMatch(propertyBag.Step.Uri.AbsoluteUri, regex, RegexOptions.IgnoreCase))
                {
                    needToStore = true;
                    break;
                }
            }
        }
        else
        {
            needToStore = true;
        }
        if (needToStore)
        {
            // Superseded hard-coded-path variant, kept for reference:
            //string folderPath = "D:/CrawlerManager/CrawlerData";
            //string instanceFolderPath = folderPath + "/" + crawlHistory.GroupId;
            //string path = folderPath + "/" + crawlHistory.GroupId + "/" + string.Format("{0}.txt", crawlHistory.Id);
            //if (!Directory.Exists(folderPath))
            //{
            //    Directory.CreateDirectory(folderPath);
            //}
            //if (!Directory.Exists(instanceFolderPath))
            //{
            //    Directory.CreateDirectory(instanceFolderPath);
            //}
            //if (!File.Exists(path))
            //{
            //    try
            //    {
            //        using (StreamWriter sw = File.CreateText(path))
            //        {
            //            sw.WriteLine(orginalHtmlContent);
            //        }
            //    }
            //    catch (Exception ex)
            //    {
            //        log4net.Config.XmlConfigurator.Configure();
            //        log4net.ILog log = log4net.LogManager.GetLogger("logger-name");
            //        log.Error(ex);
            //    }
            //}
            // Write the raw html to <group folder>/<history id>.txt, only once.
            var folderHelper = new FolderHelper();
            var path = folderHelper.GetFolderPathToStore(crawlHistory.GroupId) + "/" + string.Format("{0}.txt", crawlHistory.Id);
            Console.Write(path);
            if (!File.Exists(path))
            {
                try
                {
                    using (StreamWriter sw = File.CreateText(path))
                    {
                        sw.WriteLine(orginalHtmlContent);
                    }
                }
                catch (Exception ex)
                {
                    // File write failures are logged and otherwise ignored.
                    log4net.Config.XmlConfigurator.Configure();
                    log4net.ILog log = log4net.LogManager.GetLogger("logger-name");
                    log.Error(ex);
                }
            }
            //}
        }
        // Processed successfully: remove this entry from the queue.
        AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
        {
            e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
        });
        // Enqueue every discovered link (anchors and references), skipping
        // empty, overlong, or non-normalizable ones.
        foreach (string link in links.Links.Union(links.References))
        {
            if (link.IsNullOrEmpty() || link.Length > 396)
            {
                continue;
            }
            string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
            string normalizedLink = "";
            try
            {
                normalizedLink = NormalizeLink(baseUrl, decodedLink);
            }
            catch (Exception ex)
            {
                // Links that cannot be normalized are skipped.
                continue;
            }
            if (normalizedLink.IsNullOrEmpty())
            {
                continue;
            }
            // NOTE(review): leftover debug hook for paging links; does nothing.
            if (link.Contains("page="))
            {
                var a = 1;
            }
            crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                propertyBag.Step, new Dictionary<string, object>
                {
                    {Resources.PropertyBagKeyOriginalUrl, link},
                    {Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri}
                });
        }
    }
    catch (Exception ex)
    {
        // On any failure still drop the queue entry, then log the error.
        AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
        {
            e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
        });
        log4net.Config.XmlConfigurator.Configure();
        log4net.ILog log = log4net.LogManager.GetLogger("logger-name");
        log.Error(ex);
    }
}
/// <summary>
/// Adds the crawled page (url, text content, title) to the Lucene index,
/// skipping pages with empty text and urls that were already indexed.
/// </summary>
/// <param name="crawler">The crawler.</param>
/// <param name="propertyBag">The property bag with the downloaded page.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    // (Removed a leftover debug branch that matched "project" urls and
    // assigned an unused local.)
    if (!string.IsNullOrEmpty(propertyBag.Text) && !PreviouslyIndexed(propertyBag.Step.Uri.ToString()))
    {
        Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();

        // Store url, content and title as stored, analyzed fields with term vectors.
        Lucene.Net.Documents.Field fldURL = new Lucene.Net.Documents.Field(
            "url", propertyBag.Step.Uri.ToString(),
            Lucene.Net.Documents.Field.Store.YES,
            Lucene.Net.Documents.Field.Index.ANALYZED,
            Lucene.Net.Documents.Field.TermVector.YES);
        doc.Add(fldURL);
        Lucene.Net.Documents.Field fldContent = new Lucene.Net.Documents.Field(
            "content", propertyBag.Text,
            Lucene.Net.Documents.Field.Store.YES,
            Lucene.Net.Documents.Field.Index.ANALYZED,
            Lucene.Net.Documents.Field.TermVector.YES);
        doc.Add(fldContent);
        Lucene.Net.Documents.Field fldTitle = new Lucene.Net.Documents.Field(
            "title", propertyBag.Title,
            Lucene.Net.Documents.Field.Store.YES,
            Lucene.Net.Documents.Field.Index.ANALYZED,
            Lucene.Net.Documents.Field.TermVector.YES);
        doc.Add(fldTitle);

        // Write the document to the index.
        indexWriter.AddDocument(doc);
    }
}
/// <summary>
/// Builds a Lucene index writer, crawls the configured url feeding every page
/// through DumperStep into the index, then optimizes and closes the index.
/// </summary>
public static void SpiderThread()
{
    // State the file location of the index.
    Lucene.Net.Store.Directory dir = Lucene.Net.Store.FSDirectory.GetDirectory(indexDir, true);

    // Create an analyzer to process the text.
    Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer();

    // Create the index writer with the directory and analyzer defined.
    Lucene.Net.Index.IndexWriter indexWriter = new Lucene.Net.Index.IndexWriter(dir, analyzer, true);

    using (Crawler c = new Crawler(
        new Uri(url),
        new HtmlDocumentProcessor(
            new Dictionary<string, string>
            {
                {"<!--BeginTextFiler-->", "<!--EndTextFiler-->"}
            },
            new Dictionary<string, string>
            {
                {"<head", "</head>"} // skip any links in the head
            }),
        new DumperStep(indexWriter)) // Custom step to visualize crawl
    {
        MaximumThreadCount = threads,
        MaximumCrawlDepth = 10,
        // Skip common static resources.
        ExcludeFilter = new[]
        {
            new RegexFilter(
                new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png|\.ico|\.axd)",
                    RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase))
        },
    })
    {
        // Begin crawl
        c.Crawl();
    }

    // Optimize and close the writer.
    indexWriter.Optimize();
    indexWriter.Close();
}
/// <summary>
/// Writes the crawled step's uri to the console.
/// </summary>
/// <param name="crawler">The crawler.</param>
/// <param name="propertyBag">The property bag for the crawled step.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    Console.Out.WriteLine(propertyBag.Step.Uri);
}
/// <summary>
/// The run: configures networking, sets up database-backed crawler services
/// with log4net logging, and crawls http://www.cnblogs.com through the
/// template-parsing pipeline.
/// </summary>
public static void Run()
{
    // Loosen connection limits so the crawler is not throttled by defaults.
    ServicePointManager.MaxServicePoints = 999999;
    ServicePointManager.DefaultConnectionLimit = 999999;
    ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls;
    ServicePointManager.CheckCertificateRevocationList = true;
    ServicePointManager.EnableDnsRoundRobin = true;

    // Skip common static resources.
    IFilter[] ExtensionsToSkip = new IFilter[2];
    ExtensionsToSkip[0] = new Filter();
    ExtensionsToSkip[1] = (RegexFilter)new Regex(
        @"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png|\.ico)",
        RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase);

    DbServicesModule.Setup(true);
    // Route NCrawler's logging through log4net.
    NCrawlerModule.Register(builder =>
        builder.Register(c => new Log4NetLogService())
            .As<NCrawler.Interfaces.ILog>()
            .InstancePerDependency());

    using (var c = new Crawler(
        new Uri("http://www.cnblogs.com"),
        new HtmlDocumentProcessor(),
        new ParseByTemplate()) // Custom step to visualize crawl
    {
        MaximumThreadCount = 1,
        MaximumCrawlDepth = 50,
        ExcludeFilter = ExtensionsToSkip,
        ConnectionTimeout = new TimeSpan(0, 1, 0),
    })
    {
        c.AfterDownload += CrawlerAfterDownload;
        c.PipelineException += CrawlerPipelineException;
        c.DownloadException += CrawlerDownloadException;
        c.Crawl();
    }
}
/// <summary>
/// Creates a crawler for the given absolute url with the default
/// html-processor + DumpResult pipeline.
/// </summary>
/// <param name="Absoluteurl">Absolute url to start crawling from.</param>
/// <returns>The configured crawler.</returns>
public NCrawler.Crawler GetCrawler(string Absoluteurl)
{
    // The original initialized a null local and reassigned it; a direct
    // construction is equivalent.
    return new Crawler(
        new Uri(Absoluteurl, UriKind.Absolute),
        new HtmlDocumentProcessor(),
        new DumpResult());
}