예제 #1
0
 //public int CrawlerNo { get; set; }
 /// <summary>
 /// Creates a <c>LiteDatabase</c> from <c>QueueFile</c>, stores it in <c>Db</c>, and binds
 /// <c>Collection</c> to the collection called <paramref name="name"/>.
 /// </summary>
 /// <param name="name">Name of the collection requested from the database.</param>
 protected SaveDataStructures(string name)
 {
     var openingDbMessage = "Loading db " + QueueFile;
     Print.Show(openingDbMessage);
     Db = new LiteDatabase(QueueFile);

     var openingCollectionMessage = "Loading collection " + name;
     Print.Show(openingCollectionMessage);
     Collection = Db.GetCollection<Data>(name);
 }
예제 #2
0
        /// <summary>
        /// Replaces <c>_crawledUrls</c> with a set built from the <c>Content</c> of every record
        /// returned by <c>Collection.FindAll()</c>. When no records are returned the field is
        /// left untouched.
        /// </summary>
        private void ReadInitialHashSet()
        {
            var found = Collection.FindAll();

            // Reuse the result when it already is an array; otherwise snapshot it once.
            var records = found as Data[];
            if (records == null)
            {
                records = found.ToArray();
            }

            Print.Show("Hashset count " + records.Length);

            // Nothing stored: keep whatever _crawledUrls currently holds.
            if (records.Length == 0)
            {
                return;
            }

            _crawledUrls = new HashSet<string>(records.Select(record => record.Content));
        }
예제 #3
0
        /// <summary>
        /// Replaces <c>_queue</c> with a queue built from the <c>Content</c> of every record
        /// returned by <c>Collection.FindAll()</c>, preserving the order in which the records
        /// are returned. When no records are returned the field is left untouched.
        /// </summary>
        private void ReadInitialQueue()
        {
            var stored = Collection.FindAll();

            // Materialize before counting. Calling Count() on the raw FindAll() result and
            // then ToArray() enumerates the source twice (and, if the result is lazy,
            // presumably reads the underlying store twice).
            var records = stored as Data[] ?? stored.ToArray();

            Print.Show("Queue Item count " + records.Length);

            if (records.Length > 0)
            {
                _queue = new Queue<string>(records.Select(t => t.Content));
            }
        }
예제 #4
0
 /// <summary>
 /// Starts the crawl. When <c>Queue</c> reports a count of zero, every entry of
 /// <c>Seeds</c> is passed to <c>DownloadAndEnqueue</c> first; afterwards
 /// <c>GetFromQueue</c> is called.
 /// </summary>
 public void Start()
 {
     Print.Show("Start crawling...");

     var nothingQueued = Queue.Count == 0;
     if (nothingQueued)
     {
         foreach (var seedItem in Seeds)
         {
             DownloadAndEnqueue(seedItem);
         }
     }

     // Continue with whatever the queue holds at this point.
     GetFromQueue();
 }
예제 #5
0
        /// <summary>
        /// Loads the page at <c>Uri</c>, removes its script and style nodes, and passes the
        /// title text, body text and URL to <c>Worker.SaveDocument</c>.
        /// </summary>
        /// <returns>
        /// The loaded document, or <c>null</c> when loading throws (the exception message is
        /// printed). A document whose parsed text is missing or shorter than 10 characters is
        /// returned without being saved.
        /// </returns>
        public HtmlDocument Load()
        {
            Print.Show("Loading url: " + Uri.AbsoluteUri);
            HtmlDocument doc;

            try
            {
                doc = new HtmlWeb().Load(Uri.AbsoluteUri);
            }
            catch (Exception ex)
            {
                Print.Show(ex.Message);
                return(null);
            }

            // Compute the size defensively: the log line used to dereference ParsedText
            // before the null check below, so a null ParsedText crashed instead of
            // taking the early return.
            var parsedLength = doc.ParsedText == null ? 0 : doc.ParsedText.Length;
            Print.Show("Saving url: " + Uri.AbsoluteUri + " Size:" + parsedLength);
            if (parsedLength < 10)
            {
                return(doc);
            }

            //get text from title and body; either node may be missing on malformed pages
            var title = doc.DocumentNode.SelectSingleNode("//head//title");
            var body  = doc.DocumentNode.SelectSingleNode("//body");

            //remove script
            // Fall back to the document node when there is no <body>, so the lookup
            // itself cannot dereference null.
            var scope = body ?? doc.DocumentNode;
            var nodes = scope.SelectNodes("//script|//style");

            // SelectNodes yields null rather than an empty collection when nothing matches.
            if (nodes != null)
            {
                foreach (var node in nodes)
                {
                    node.ParentNode.RemoveChild(node);
                }
            }

            HtmlToTextConverter textConverter = new HtmlToTextConverter();

            // A missing title/body is saved as an empty string instead of throwing.
            Worker.SaveDocument(new SiteInfo
            {
                BodyContent  = body == null ? string.Empty : textConverter.ToText(body.InnerText),
                TitleContent = title == null ? string.Empty : textConverter.ToText(title.InnerText),
                Url          = Uri.AbsoluteUri
            });
            return(doc);
        }
예제 #6
0
        /// <summary>
        /// Collects the <c>href</c> values of all anchor elements in <paramref name="html"/>
        /// and returns them as a list of URL strings.
        /// </summary>
        /// <param name="html">Document whose anchors are scanned.</param>
        /// <param name="url">Address of the document; used as the base when a link does not
        /// pass <c>ValidateUrl.Validate</c>.</param>
        /// <returns>
        /// The collected links. Links containing <c>#</c> are skipped, and links for which
        /// building the URI throws are printed and skipped.
        /// </returns>
        public IEnumerable <string> Parse(HtmlDocument html, Uri url)
        {
            Print.Show("Parsing url:" + url.AbsoluteUri);

            // Materialize once: the query is deferred and is both counted and iterated
            // below, which would otherwise walk the document twice.
            var linkedPages = html.DocumentNode.Descendants("a")
                              .Select(a => a.GetAttributeValue("href", null))
                              .Where(u => !String.IsNullOrEmpty(u))
                              .ToList();

            Print.Show(linkedPages.Count + " urls in" + url.AbsoluteUri);

            List <string> uriList = new List <string>(linkedPages.Count);

            foreach (string linkedPage in linkedPages)
            {
                // Skip links that contain a fragment marker.
                if (linkedPage.Contains("#"))
                {
                    continue;
                }
                var result = ValidateUrl.Validate(linkedPage);
                try
                {
                    if (!result.Item1)
                    {
                        // Validation failed: combine the link with the page address.
                        uriList.Add(new Uri(url, linkedPage).AbsoluteUri);
                    }
                    else
                    {
                        uriList.Add(linkedPage);
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: report the bad link and keep going.
                    Print.Show(ex.Message);
                }
            }
            return(uriList);
        }
예제 #7
0
 /// <summary>
 /// Initializes the instance over the collection named "HashSet": assigns an empty set to
 /// <c>_crawledUrls</c>, prints a status line, then calls <c>ReadInitialHashSet</c>.
 /// </summary>
 public SavingHashSet()
     : base("HashSet")
 {
     // Start from an empty set before the initial read runs.
     _crawledUrls = new HashSet<string>();

     Print.Show("Reading hashset...");
     ReadInitialHashSet();
 }
예제 #8
0
 /// <summary>
 /// Initializes the instance over the collection named "Queue": assigns an empty queue to
 /// <c>_queue</c>, prints a status line, then calls <c>ReadInitialQueue</c>.
 /// </summary>
 public SavingQueue()
     : base("Queue")
 {
     // Start from an empty queue before the initial read runs.
     _queue = new Queue<string>();

     Print.Show("Initializing Queue");
     ReadInitialQueue();
 }