Example #1
        public static void Run()
        {
            //NCrawlerModule.Setup();

            //// Setup crawler to crawl http://ncrawler.codeplex.com
            //// with 1 thread adhering to robot rules, and maximum depth
            //// of 2 with 4 pipeline steps:
            ////	* Step 1 - The Html Processor, parses and extracts links, text and more from html
            ////  * Step 2 - Processes PDF files, extracting text
            ////  * Step 3 - Try to determine language based on page, based on text extraction, using google language detection
            ////  * Step 4 - Dump the information to the console, this is a custom step, see the DumperStep class

            using (Crawler c = new Crawler(new Uri("http://bidvportal.vn/"),
                new HtmlDocumentProcessor(), // Process html
                //new iTextSharpPdfProcessor.iTextSharpPdfProcessor(), // Add PDF text extraction
                new GoogleLanguageDetection(), // Add language detection
                new Mp3FileProcessor(), // Process MP3 files
                new BIDVObjectDumperStep())
            {
                // Custom step to visualize crawl
                MaximumThreadCount = 10,
                MaximumCrawlDepth = 2,
                //ExcludeFilter = Program.ExtensionsToSkip,
            })
            {
                // Begin crawl
                c.Crawl();
            }
        }
Example #2
        public static void Run()
        {
            NCrawlerModule.Setup();
            Console.Out.WriteLine("Simple crawl demo");

            // Setup crawler to crawl http://gre.magoosh.com/questions/5
            // with 1 thread adhering to robot rules, and maximum depth
            // of 1 with 3 pipeline steps:
            //  * Step 1 - The Html Processor, parses and extracts links, text and more from html
            //  * Step 2 - Try to determine the page language from the extracted text, using google language detection
            //  * Step 3 - Dump the information to the console, this is a custom step, see the DumperStep class
            using (Crawler c = new Crawler(new Uri("http://gre.magoosh.com/questions/5"),
                new HtmlDocumentProcessor(), // Process html
                new GoogleLanguageDetection(),
                new DumperStep())
            {
                // Custom step to visualize crawl
                MaximumThreadCount = 1,
                MaximumCrawlDepth = 1,
                ExcludeFilter = Program.ExtensionsToSkip,
            })
            {
                // Begin crawl
                c.Crawl();
            }
        }
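Example #2 references Program.ExtensionsToSkip, which is defined outside the snippet. A minimal sketch of such a definition, modeled on the RegexFilter arrays in Examples #23 and #25 below (the containing class and field name are assumptions), might look like:

        // Hypothetical filter array that skips common static-asset extensions,
        // mirroring the ExcludeFilter values used in the later examples.
        public static readonly IFilter[] ExtensionsToSkip =
        {
            new RegexFilter(
                new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png|\.ico)",
                    RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase))
        };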
Example #3
        /// <summary>
        /// </summary>
        /// <param name="crawler">
        /// The crawler.
        /// </param>
        /// <param name="propertyBag">
        /// The property bag.
        /// </param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value;
            string cultureDisplayValue = "N/A";
            if (!contentCulture.IsNull())
            {
                cultureDisplayValue = contentCulture.DisplayName;
            }

            TextExtraction t = new TextExtraction(); // used only by the commented-out extraction code below

            lock (this)
            {
                BIDVObject item = new BIDVObject();
                //item.Id = Guid.NewGuid();
                //item.Url = propertyBag.Step.Uri.ToString();

                //if (item.Url.StartsWith("http://bidvportal.vn/eDocman"))
                //{
                //    item.Title = propertyBag.Title;

                //    string strTarget = t.GetMinimumString(propertyBag.Text, "Chi tiết văn bản", "Nội dung văn bản");
                //    item.Text = strTarget;

                //    string strNgayPhatHanh = t.GetMinimumString(strTarget, "Ngày phát hành", "Số đi");
                //    item.NgayPhatHanh = strNgayPhatHanh.Replace(' ','/');

                //    string strSubject = t.GetMinimumString(strTarget, "Trích yếu", "Độ khẩn");
                //    //item.Subject = strSubject;

                //    //item.ContentEncoding = propertyBag.ContentEncoding;
                //    //item.ContentType = propertyBag.ContentType;
                //    //item.Length = propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length;
                //    item.Depth = propertyBag.Step.Depth;
                //    //item.CultureDisplayValue = cultureDisplayValue;

                //    string[] strSplit = { "/" };
                //    int day = int.Parse(item.NgayPhatHanh.Split(strSplit, StringSplitOptions.None)[0]);
                //    int month = int.Parse(item.NgayPhatHanh.Split(strSplit, StringSplitOptions.None)[1]);
                //    int year = int.Parse(item.NgayPhatHanh.Split(strSplit, StringSplitOptions.None)[2]);

                //    if ((DateTime.Now.Year == year) && (DateTime.Now.Month == month) && (DateTime.Now.Day == day))
                //    {
                //        //db.AddToItems(item);
                //    }
                //}
            }

            try
            {
                db.SaveChanges();
            }
            catch (Exception ex)
            {
                Console.WriteLine("=====================================================");
                Console.WriteLine(ex.Message);
            }
        }
Example #4
        /// <summary>
        /// </summary>
        /// <param name="crawler">
        /// The crawler.
        /// </param>
        /// <param name="propertyBag">
        /// The property bag.
        /// </param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value;
            string cultureDisplayValue = "N/A";
            if (!contentCulture.IsNull())
            {
                cultureDisplayValue = contentCulture.DisplayName;
            }

            TextExtraction t = new TextExtraction();

            lock (this)
            {
                BIDVObject item = new BIDVObject();
                item.OriginalUrl = propertyBag.Step.Uri.ToString();

                if (!IsDuplicate(item.OriginalUrl))
                {
                    item.Title = propertyBag.Title;
                    item.StatusDescription = propertyBag.StatusDescription;
                    item.ResponseUri = propertyBag.ResponseUri.ToString();
                    item.Text = propertyBag.Text;
                    item.Depth = propertyBag.Step.Depth;
                    item.LastModified = propertyBag.LastModified;
                    item.OriginalReferrerUrl = propertyBag.OriginalReferrerUrl.ToString();
                    item.Server = propertyBag.Server;
                    string description = t.GetBetween2Words("Chi tiết văn bản", "Xem toàn màn hình", item.Text.Replace("\r","  ").Replace("\n","  "));
                    item.Summary = t.RemoveWhiteSpace(description);

                    string strNgayPhatHanh = t.GetBetween2Words("Ngày phát hành", "Số đi", item.Summary);
                    strNgayPhatHanh = strNgayPhatHanh.Replace(' ', '/').Remove(0, ("Ngày phát hành").Length);
                    string[] strSplit = { "/" };
                    int day = int.Parse(strNgayPhatHanh.Split(strSplit, StringSplitOptions.None)[1]);
                    int month = int.Parse(strNgayPhatHanh.Split(strSplit, StringSplitOptions.None)[2]);
                    int year = int.Parse(strNgayPhatHanh.Split(strSplit, StringSplitOptions.None)[3]);

                    // Clear the Text field so the full page text is not stored
                    item.Text = null;
                    item.IsToEmail = false;

                    db.AddToBIDVObjects(item);

                    item.ContentEncoding = propertyBag.ContentEncoding;
                    item.ContentType = propertyBag.ContentType;
                    //item.Length = propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length;
                    //item.CultureDisplayValue = cultureDisplayValue;
                }
            }

            try
            {
                db.SaveChanges();
            }
            catch (Exception)
            {
                // Rethrow as-is; wrapping in new Exception(ex.Message) would discard the stack trace
                throw;
            }
        }
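The index-based date parsing above is brittle. A sketch of a safer variant, assuming strNgayPhatHanh reduces to a d/M/yyyy value once the label and leading separator are stripped, could use DateTime.TryParseExact:

            // Hypothetical replacement for the manual Split/int.Parse sequence above.
            DateTime published;
            if (DateTime.TryParseExact(strNgayPhatHanh.Trim('/'), "d/M/yyyy",
                    CultureInfo.InvariantCulture, DateTimeStyles.None, out published))
            {
                bool publishedToday = published.Date == DateTime.Today;
            }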
Example #5
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            ITemplate template = SelectTemplate(propertyBag);
            if (template != null)
            {
                XmlDocument result = template.Parse(propertyBag["HtmlDoc"].Value as HtmlAgilityPack.HtmlDocument);
                MongoDBSaver saver = new MongoDBSaver();
                saver.Save(propertyBag, result);
            }
        }
Example #6
        public static void Run()
        {
            Console.Out.WriteLine("Simple crawl demo using local database a storage");

            var targetToCrawl = ConfigurationManager.AppSettings["CrawlTargetUrl"];
            var maximumThreadCount = int.Parse(ConfigurationManager.AppSettings["MaximumThreadCount"]);
            var maximumCrawlDepth = int.Parse(ConfigurationManager.AppSettings["MaximumCrawlDepth"]);

            // Setup crawler to crawl the target configured above
            // with the configured thread count and maximum depth,
            // adhering to robot rules, with 2 pipeline steps:
            //  * Step 1 - The WholeHtmlProcessor, parses and extracts links, text and more from html
            //  * Step 2 - Dump the information to the console, this is a custom step, see the DumperStep class
            DbServicesModule.Setup(true);
            using (Crawler c = new Crawler(new Uri(targetToCrawl),
                new WholeHtmlProcessor(), // Process html
                new DumperStep())
            {
                // Custom step to visualize crawl
                MaximumThreadCount = maximumThreadCount,
                MaximumCrawlDepth = maximumCrawlDepth,
                ExcludeFilter = Program.ExtensionsToSkip,
            })
            {
                AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
                {
                    if (e.CrawlQueue.Any())
                    {
                        var uri = new Uri(targetToCrawl);
                        
                        var groupId = uri.GetHashCode();
                        Console.Out.WriteLine("GroupId=" + groupId);
                        e.ExecuteStoreCommand("Update CrawlQueue set Exclusion='false' where GroupId={0} and Exclusion='true'", groupId);
                        //var exclusion = e.CrawlQueue.Where(m => m.Exclusion && m.GroupId == groupId).ToList();
                        //if (exclusion.Any())
                        //{
                        //    Console.Out.WriteLine("Count with Exclusion=" + exclusion.Count);
                        //    exclusion.ForEach(m => m.Exclusion = false);
                        //}
                        ////foreach (var crawlQueue in e.CrawlQueue)
                        ////{
                        ////    crawlQueue.Exclusion = false;
                        ////}
                        //e.SaveChanges();
                    }
                });
                // Begin crawl
                Console.Out.WriteLine(" Begin crawl");
                c.Crawl();
            }
        }
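Example #6 reads its target and limits from the application configuration. Assuming standard appSettings entries matching the keys read above, the App.config section might look like:

        <!-- Hypothetical App.config values; the URL is a placeholder -->
        <appSettings>
            <add key="CrawlTargetUrl" value="http://example.com/" />
            <add key="MaximumThreadCount" value="1" />
            <add key="MaximumCrawlDepth" value="2" />
        </appSettings>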
Example #7
        public void Crawl()
        {
            using (Crawler c = new Crawler(new Uri(this.WebsiteUrl), new HtmlDocumentProcessor(), new DocumentIndexStep(this.Config, this.LogWrapper)))
            {
                this.LogWrapper.Info("Crawler started: Using " + (System.Environment.ProcessorCount * 2) + " threads");

                c.AdhereToRobotRules = true;
                c.MaximumThreadCount = System.Environment.ProcessorCount * 2;
                c.ExcludeFilter = new[] {
                    new NCrawler.Services.RegexFilter(new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png|\.ico)"))
                };
                c.Crawl();
            }
        }
Example #8
        /// <summary>
        /// </summary>
        /// <param name="crawler">
        /// The crawler.
        /// </param>
        /// <param name="propertyBag">
        /// The property bag.
        /// </param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value;
            string cultureDisplayValue = "N/A";
            if (!contentCulture.IsNull())
            {
                cultureDisplayValue = contentCulture.DisplayName;
            }

            TextExtraction t = new TextExtraction();

            lock (this)
            {
                ASPNETObject item = new ASPNETObject();
                item.OriginalUrl = propertyBag.Step.Uri.ToString();

                if (!IsDuplicate(item.OriginalUrl))
                {
                    item.Title = propertyBag.Title;
                    item.StatusDescription = propertyBag.StatusDescription;
                    item.ResponseUri = propertyBag.ResponseUri.ToString();
                    item.Text = null;
                    item.Depth = propertyBag.Step.Depth;
                    item.LastModified = propertyBag.LastModified;
                    item.OriginalReferrerUrl = propertyBag.OriginalReferrerUrl.ToString();
                    item.Server = propertyBag.Server;
                    // Text was set to null above, so only metadata is stored
                    db.AddToASPNETObjects(item);
                    item.ContentEncoding = propertyBag.ContentEncoding;
                    item.ContentType = propertyBag.ContentType;
                    item.IsToEmail = false;
                    item.Summary = propertyBag.Title;

                    //item.Length = propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length;

                    //item.CultureDisplayValue = cultureDisplayValue;

                }
            }

            try
            {
                db.SaveChanges();
            }
            catch (Exception)
            {
                // Rethrow as-is; wrapping in new Exception(ex.Message) would discard the stack trace
                throw;
            }
        }
Example #9
        private void DO(CrawlerInfo ci)
        {
            var uri      = new Uri(ci.url.Url);
            var siteType = HtmlParse.RecogSite(uri);
            var c        = new NCrawler.Crawler(uri, new HtmlDocumentProcessor(),
                                                new MyPipelineStep(ci))
            {
                MaximumCrawlDepth  = CrawlArgs.CrawlDepth(siteType),
                MaximumThreadCount = 5,
                IncludeFilter      = CrawlArgs.IncludeFilter(siteType),
                ExcludeFilter      = CrawlArgs.ExcludeFilter(siteType),
            };

            c.Crawl();
        }
Example #10
        public String Crawl(int maxThreadCount = 4, int maxCrawlDepth = 100)
        {
            //Use an in-memory setup
            NCrawlerModule.Setup();

            //Always push the html document processor onto the pipeline
            this.pipelineSteps.Insert(0, new HtmlDocumentProcessor());
            using (Crawler c = new Crawler(this.root, this.pipelineSteps.ToArray<NCrawler.Interfaces.IPipelineStep>())
                  {
                      MaximumThreadCount = maxThreadCount,
                      MaximumCrawlDepth = maxCrawlDepth,
                      AdhereToRobotRules = false,
                  })
            {
                c.BeforeDownload += new EventHandler<NCrawler.Events.BeforeDownloadEventArgs>(c_BeforeDownload);
                c.AfterDownload += new EventHandler<NCrawler.Events.AfterDownloadEventArgs>(c_AfterDownload);
                c.DownloadProgress += new EventHandler<NCrawler.Events.DownloadProgressEventArgs>(c_DownloadProgress);
                c.Crawl();
            }

            return this.siteHash;
        }
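The handlers wired up above (c_BeforeDownload and friends) are defined elsewhere in the class. Minimal stubs, assuming only the NCrawler.Events argument types named in the subscriptions, would be:

        // Hypothetical handler stubs matching the event subscriptions above.
        void c_BeforeDownload(object sender, NCrawler.Events.BeforeDownloadEventArgs e)
        {
            // Inspect or adjust the request before it is issued
        }

        void c_AfterDownload(object sender, NCrawler.Events.AfterDownloadEventArgs e)
        {
            // Examine the downloaded content
        }

        void c_DownloadProgress(object sender, NCrawler.Events.DownloadProgressEventArgs e)
        {
            // Track progress, e.g. for a console counter
        }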
Example #11
		private static void Main(string[] args)
		{
			if (args == null || args.Length == 0)
			{
				arguments.ShowUsageShort();
			}
			else
			{
				arguments.m_StartupArgumentOptionSet.Parse(args);

				using (Crawler crawler = new Crawler(new Uri("http://ncrawler.codeplex.com"),
					new HtmlDocumentProcessor(),
					new ConsolePipelineStep()))
				{
					crawler.MaximumThreadCount = 10;
					crawler.Cancelled += crawler_Cancelled;
					crawler.DownloadException += crawler_DownloadException;
					crawler.DownloadProgress += crawler_DownloadProgress;
					crawler.PipelineException += crawler_PipelineException;
					crawler.Crawl();
				}
			}
		}
Example #12
        public void Process(NCrawler.Crawler crawler, PropertyBag propertyBag)
        {
            var rsp = propertyBag.GetResponse(); // note: the response is retrieved here but the variable is unused

            try
            {
                HtmlDocument htmlDoc  = HtmlParse.LoadFromHtml(propertyBag);
                var          siteType = HtmlParse.RecogSite(propertyBag.ResponseUri);
                var          records  = Parse(htmlDoc, siteType);
                if (records == null)
                {
                    return;
                }
                foreach (var record in records)
                {
                    DAL.Data.Add(record);
                    ++ci.Count;
                }
            }
            catch (NullReferenceException)
            {
                // Ignore pages that lack the nodes the parser expects
            }
        }
Example #13
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            if (!bindevents)
            {
                crawler.CrawlFinished += new EventHandler<CrawlFinishedEventArgs>(crawler_CrawlFinished);
                bindevents = true;
            }

            string id = config.GetDocumentPath(propertyBag.Step.Uri);

            if (propertyBag.StatusCode == System.Net.HttpStatusCode.OK)
            {
                repository.AddUpdate(id, propertyBag.Title, propertyBag.Text, propertyBag.LastModified);
                log.Info("Add/Update [" + id + "]");

            } else if (propertyBag.StatusCode == System.Net.HttpStatusCode.NotFound)
            {
                log.Warning("Crawler encoutered 404 for [" + id + "]");
                repository.Delete(id);
            } else
            {
                log.Warning(string.Format("Crawler encountered status {0} - {4} ({1}) for document {2} - {3}", propertyBag.StatusCode.ToString(), propertyBag.StatusDescription, id, propertyBag.Step.Uri, ((int)propertyBag.StatusCode).ToString()));
            }
        }
Example #14
        /// <summary>
        /// </summary>
        /// <param name = "crawler">
        /// 	The crawler.
        /// </param>
        /// <param name = "propertyBag">
        /// 	The property bag.
        /// </param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value;
            string cultureDisplayValue = "N/A";
            if (!contentCulture.IsNull())
            {
                cultureDisplayValue = contentCulture.DisplayName;
            }

            lock (this)
            {
                Console.Out.WriteLine(ConsoleColor.Gray, "Url: {0}", propertyBag.Step.Uri);
                Console.Out.WriteLine(ConsoleColor.Blue, "stuff -> " + propertyBag.Text);
                Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tContent type: {0}", propertyBag.ContentType);
                Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tContent length: {0}",
                    propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length);
                Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tDepth: {0}", propertyBag.Step.Depth);
                Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tCulture: {0}", cultureDisplayValue);
                Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tThreadId: {0}", Thread.CurrentThread.ManagedThreadId);
                Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tThread Count: {0}", crawler.ThreadsInUse);
                Console.Out.WriteLine();
            }
        }
Example #15
 public void CrawlerSetting(Crawler c)
 {
     c.MaximumThreadCount = 2;
     c.MaximumCrawlDepth = 1;
 }
Example #16
        private void CreateCrawler()
        {
            ServicePointManager.MaxServicePoints = 999999;
            ServicePointManager.DefaultConnectionLimit = 999999;
            ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls;
            ServicePointManager.CheckCertificateRevocationList = true;
            ServicePointManager.EnableDnsRoundRobin = true;

            var echo = new EchoStep();
            echo.OneWorkFinished += ShowText;

            _crawler = new Crawler(
                new Uri("http://www.cnblogs.com"),
                new HtmlDocumentProcessor(),
                echo
                )
                           {
                               MaximumThreadCount = 1,
                               MaximumCrawlDepth = 3,
                               ExcludeFilter = new[]
                                                   {
                                                       new RegexFilter(
                                                           new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png|\.ico)",
                                                                     RegexOptions.Compiled |
                                                                     RegexOptions.CultureInvariant |
                                                                     RegexOptions.IgnoreCase))
                                                   },
                           };
        }
Example #17
            /// <summary>
            /// </summary>
            /// <param name="crawler">
            /// The crawler.
            /// </param>
            /// <param name="propertyBag">
            /// The property bag.
            /// </param>
            public void Process(Crawler crawler, PropertyBag propertyBag)
            {
                //CultureInfo contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value;
                //string cultureDisplayValue = "N/A";
                //if (!contentCulture.IsNull())
                //{
                //    cultureDisplayValue = contentCulture.DisplayName;
                //}

                lock (this)
                {
                    //EchoControl.Invoke(new ShowTitleDelegate(ShowTitle), propertyBag.Title);
                    InvokeOneWorkFinished(propertyBag.Title);
                    //Console.Out.WriteLine(ConsoleColor.Gray, "Url: {0}", propertyBag.Step.Uri);
                    //Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tContent type: {0}", propertyBag.ContentType);
                    //Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tContent length: {0}", propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length);
                    //Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tDepth: {0}", propertyBag.Step.Depth);
                    //Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tCulture: {0}", cultureDisplayValue);
                    //Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tThreadId: {0}", System.Threading.Thread.CurrentThread.ManagedThreadId);
                    //Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tThread Count: {0}", crawler.ThreadsInUse);
                    //Console.Out.WriteLine();
                }
            }
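Example #17 reads like the Process method of the EchoStep used in Example #16. A sketch of the surrounding class, assuming the single-method IPipelineStep interface seen throughout these examples and a plain string event, might be:

            // Hypothetical EchoStep skeleton tying Examples #16 and #17 together.
            public class EchoStep : IPipelineStep
            {
                public event Action<string> OneWorkFinished;

                public void Process(Crawler crawler, PropertyBag propertyBag)
                {
                    lock (this)
                    {
                        InvokeOneWorkFinished(propertyBag.Title);
                    }
                }

                private void InvokeOneWorkFinished(string title)
                {
                    var handler = OneWorkFinished;
                    if (handler != null)
                    {
                        handler(title);
                    }
                }
            }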
Example #18
 public void RunCrawl(Crawler c)
 {
     c.Crawl();
 }
Example #19
 public void Process(Crawler crawler, PropertyBag propertyBag)
 {
     HttpContext.Current.Response.Write("<br>FindUrl:" + propertyBag.Step.Uri);
 }
Example #20
        public void Crawl()
        {
            using (Crawler c = new Crawler(new Uri(this.WebsiteUrl), new HtmlDocumentProcessor(FilterTextRules, FilterLinksRules), new DocumentIndexStep(this.Config, this.LogWrapper)))
            {
                this.LogWrapper.Info("Crawler started: Using " + MaximumThreadCount + " threads");

                c.AdhereToRobotRules = AdhereToRobotRules;
                c.MaximumThreadCount = MaximumThreadCount;
                c.ExcludeFilter = ExcludeFilter;
                c.UriSensitivity = UriSensitivity;
                c.MaximumCrawlDepth = MaximumCrawlDepth;
                c.Crawl();
            }
        }
Example #21
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
                NotNull(crawler, "crawler").
                NotNull(propertyBag, "propertyBag");

            string stepUri = Uri.UnescapeDataString(propertyBag.Step.Uri.AbsoluteUri);
            if (stepUri.Length > 396)
            {
                stepUri = stepUri.Substring(0, 396);
            }
            var crawlHistory = AspectF.Define.
               Return<CrawlHistory, NCrawlerEntitiesDbServices>(
                   e => e.CrawlHistory.Where(m => m.Key == stepUri).FirstOrDefault());

            if (crawlHistory == null)
            {
                AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
                {
                    e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", stepUri);
                });
                return;
            }
            try
            {
                if (propertyBag.StatusCode != HttpStatusCode.OK)
                {
                    AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
                    {
                        e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
                        //CrawlQueue result = e.CrawlQueue.FirstOrDefault(q => q.Key == crawlHistory.Key);
                        //if (!result.IsNull())
                        //{
                        //    e.DeleteObject(result);
                        //    e.SaveChanges();
                        //}
                    });
                    return;
                }

                if (!IsHtmlContent(propertyBag.ContentType))
                {
                    AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
                    {
                        e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
                        //CrawlQueue result = e.CrawlQueue.FirstOrDefault(q => q.Key == crawlHistory.Key);
                        //if (!result.IsNull())
                        //{
                        //    e.DeleteObject(result);
                        //    e.SaveChanges();
                        //}
                    });
                    return;
                }
                HtmlDocument htmlDoc = new HtmlDocument
                {
                    OptionAddDebuggingAttributes = false,
                    OptionAutoCloseOnEnd = true,
                    OptionFixNestedTags = true,
                    OptionReadEncoding = true
                };
                using (Stream reader = propertyBag.GetResponse())
                {
                    Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
                    reader.Seek(0, SeekOrigin.Begin);
                    if (!documentEncoding.IsNull())
                    {
                        htmlDoc.Load(reader, documentEncoding, true);
                    }
                    else
                    {
                        htmlDoc.Load(reader, true);
                    }

                    //string content = reader.ReadToEnd();
                    //resultHtmlContent = content;
                }
                //string steplUri = propertyBag.ResponseUri.OriginalString;


                string orginalHtmlContent = htmlDoc.DocumentNode.OuterHtml;
                string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
                DocumentWithLinks links = htmlDoc.GetLinks();

                //string urlRegex = @"^http://www.bbc.co.uk/food/recipes/[^#/]+$";
                List<string> recipeRegex = null;
                var jsonStr = cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite") as string;
                if (jsonStr == null)
                {
                    using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
                    {
                        jsonStr = stream.ReadToEnd();
                        var policy = new CacheItemPolicy();
                        policy.Priority = CacheItemPriority.NotRemovable;
                        policy.AbsoluteExpiration = DateTimeOffset.Now.AddDays(1);
                        cache.Set(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite", jsonStr, policy);
                        Console.WriteLine("cache --" + AppDomain.CurrentDomain.BaseDirectory + " :" + cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite"));
                    }
                }
                var json = JsonConvert.DeserializeObject<OriginalWebSiteTxt>(jsonStr);
                if (json.RecipeRegex != null && json.RecipeRegex.Count > 0)
                {
                    recipeRegex = json.RecipeRegex;
                }
                bool needToStore = false;

                if (recipeRegex != null)
                {
                    foreach (var regex in recipeRegex)
                    {
                        if (Regex.IsMatch(propertyBag.Step.Uri.AbsoluteUri, regex, RegexOptions.IgnoreCase))
                        {
                            needToStore = true;
                            break;
                        }
                    }
                }
                else
                {
                    needToStore = true;
                }

                if (needToStore)
                {
                    //string folderPath = "D:/CrawlerManager/CrawlerData";
                    //string instanceFolderPath = folderPath + "/" + crawlHistory.GroupId;
                    //string path = folderPath + "/" + crawlHistory.GroupId + "/" + string.Format("{0}.txt", crawlHistory.Id);
                    //if (!Directory.Exists(folderPath))
                    //{
                    //    Directory.CreateDirectory(folderPath);
                    //}
                    //if (!Directory.Exists(instanceFolderPath))
                    //{
                    //    Directory.CreateDirectory(instanceFolderPath);
                    //}

                    //if (!File.Exists(path))
                    //{
                    //    try
                    //    {

                    //        using (StreamWriter sw = File.CreateText(path))
                    //        {
                    //            sw.WriteLine(orginalHtmlContent);
                    //        }

                    //    }
                    //    catch (Exception ex)
                    //    {
                    //        log4net.Config.XmlConfigurator.Configure();
                    //        log4net.ILog log = log4net.LogManager.GetLogger("logger-name");
                    //        log.Error(ex);
                    //    }
                    //}
                    var folderHelper = new FolderHelper();
                    var path = folderHelper.GetFolderPathToStore(crawlHistory.GroupId) + "/" + string.Format("{0}.txt", crawlHistory.Id);
                    Console.Write(path);

                    if (!File.Exists(path))
                    {
                        try
                        {
                            using (StreamWriter sw = File.CreateText(path))
                            {
                                sw.WriteLine(orginalHtmlContent);
                            }

                        }
                        catch (Exception ex)
                        {
                            log4net.Config.XmlConfigurator.Configure();
                            log4net.ILog log = log4net.LogManager.GetLogger("logger-name");
                            log.Error(ex);
                        }
                    }
                    //}
                }

                AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
                {
                    e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
                });

                foreach (string link in links.Links.Union(links.References))
                {
                    if (link.IsNullOrEmpty() || link.Length > 396)
                    {
                        continue;
                    }

                    string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
                    string normalizedLink = "";
                    try
                    {
                        normalizedLink = NormalizeLink(baseUrl, decodedLink);
                    }
                    catch (Exception)
                    {
                        // Skip links that cannot be normalized
                        continue;
                    }
                    
                    if (normalizedLink.IsNullOrEmpty())
                    {
                        continue;
                    }

                    crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                        propertyBag.Step, new Dictionary<string, object>
                        {
                            {Resources.PropertyBagKeyOriginalUrl, link},
                            {Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri}
                        });
                }

            }
            catch (Exception ex)
            {
                AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
                {
                    e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
                });
                log4net.Config.XmlConfigurator.Configure();
                log4net.ILog log = log4net.LogManager.GetLogger("logger-name");
                log.Error(ex);
            }
        }
Example #22
            public void Process(Crawler crawler, PropertyBag propertyBag)
            {
                if (!string.IsNullOrEmpty(propertyBag.Text) && !PreviouslyIndexed(propertyBag.Step.Uri.ToString()))
                {
                    Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();

                    //add string properties
                    Lucene.Net.Documents.Field fldURL = new Lucene.Net.Documents.Field("url", propertyBag.Step.Uri.ToString(), Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.YES);
                    doc.Add(fldURL);
                    Lucene.Net.Documents.Field fldContent = new Lucene.Net.Documents.Field("content", propertyBag.Text, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.YES);
                    doc.Add(fldContent);
                    Lucene.Net.Documents.Field fldTitle = new Lucene.Net.Documents.Field("title", propertyBag.Title, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.YES);
                    doc.Add(fldTitle);

                    //write the document to the index
                    indexWriter.AddDocument(doc);
                }
            }
Example #23
        public static void SpiderThread()
        {
            //state the file location of the index
            Lucene.Net.Store.Directory dir = Lucene.Net.Store.FSDirectory.GetDirectory(indexDir, true);

            //create an analyzer to process the text
            Lucene.Net.Analysis.Analyzer analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer();

            //create the index writer with the directory and analyzer defined.
            Lucene.Net.Index.IndexWriter indexWriter = new Lucene.Net.Index.IndexWriter(dir, analyzer, true);

            using
            (
                Crawler c = new Crawler
                (
                    new Uri(url),
                    new HtmlDocumentProcessor
                    (
                        new Dictionary<string, string>
                        {
                            {"<!--BeginTextFiler-->", "<!--EndTextFiler-->"}
                        },
                        new Dictionary<string, string>
                        {
                            {"<head", "</head>"}//skip any links in the head
                        }
                    ),
                    new DumperStep(indexWriter)
                )
                {
                    // Custom step to visualize crawl
                    MaximumThreadCount = threads,
                    MaximumCrawlDepth = 10,
                    ExcludeFilter = new[]
                    {
                        new RegexFilter(
                            new Regex(@"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png|\.ico|\.axd)", RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase)
                            )
                    },
                }
            )
            {
                // Begin crawl
                c.Crawl();
            }

            //optimize and close the writer
            indexWriter.Optimize();
            indexWriter.Close();
        }
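To read the index this method builds, a query sketch using the same Lucene.Net 2.x-era API as the calls above (Hits and searcher.Close() were removed in later versions, so treat the version as an assumption) might be:

            // Hypothetical search over the index created by SpiderThread.
            var searchDir = Lucene.Net.Store.FSDirectory.GetDirectory(indexDir, false);
            var searcher = new Lucene.Net.Search.IndexSearcher(searchDir);
            var query = new Lucene.Net.Search.TermQuery(new Lucene.Net.Index.Term("content", "crawler"));
            Lucene.Net.Search.Hits hits = searcher.Search(query);
            for (int i = 0; i < hits.Length(); i++)
            {
                Console.WriteLine(hits.Doc(i).Get("url"));
            }
            searcher.Close();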
Example #24
 public void Process(Crawler crawler, PropertyBag propertyBag)
 {
     Console.Out.WriteLine(propertyBag.Step.Uri);
 }
Example #25
        /// <summary>
        /// The run.
        /// </summary>
        public static void Run()
        {
            ServicePointManager.MaxServicePoints = 999999;
            ServicePointManager.DefaultConnectionLimit = 999999;
            ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls;
            ServicePointManager.CheckCertificateRevocationList = true;
            ServicePointManager.EnableDnsRoundRobin = true;

            IFilter[] ExtensionsToSkip = new IFilter[2];
            ExtensionsToSkip[0] = new Filter();
            ExtensionsToSkip[1] =
                (RegexFilter)
                new Regex(
                    @"(\.jpg|\.css|\.js|\.gif|\.jpeg|\.png|\.ico)",
                    RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase);
            DbServicesModule.Setup(true);

            NCrawlerModule.Register(
                builder => builder.Register(
                    c => new Log4NetLogService()).As<NCrawler.Interfaces.ILog>().InstancePerDependency());

            using (
                var c = new Crawler(
                    new Uri("http://www.cnblogs.com"),
                    new HtmlDocumentProcessor(),
                    new ParseByTemplate()
                    )
                    {
                        // Custom step to visualize crawl
                        MaximumThreadCount = 1,
                        MaximumCrawlDepth = 50,
                        ExcludeFilter = ExtensionsToSkip,
                        ConnectionTimeout = new TimeSpan(0, 1, 0),
                    })
            {
                c.AfterDownload += CrawlerAfterDownload;
                c.PipelineException += CrawlerPipelineException;
                c.DownloadException += CrawlerDownloadException;
                c.Crawl();
            }
        }
Example #26
        public NCrawler.Crawler GetCrawler(string Absoluteurl)
        {
            Crawler c = new Crawler(new Uri(Absoluteurl, UriKind.Absolute), new HtmlDocumentProcessor(), new DumpResult());
            return c;
        }
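A caller would typically dispose the crawler returned by GetCrawler, following the using pattern in the other examples; for instance (the URL is a placeholder):

        // Hypothetical usage of GetCrawler from Example #26.
        using (Crawler c = GetCrawler("http://example.com/"))
        {
            c.Crawl();
        }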