/// <summary>
/// Opens a <c>LiteDatabase</c> on <c>QueueFile</c> and binds <c>Collection</c> to the
/// collection of <c>Data</c> records identified by <paramref name="name"/>.
/// </summary>
/// <param name="name">Collection name handed to <c>Db.GetCollection</c>.</param>
protected SaveDataStructures(string name)
{
    // Announce, then open, the backing database file.
    string databaseMessage = "Loading db " + QueueFile;
    Print.Show(databaseMessage);
    Db = new LiteDatabase(QueueFile);

    // Announce, then resolve, the named collection inside that database.
    string collectionMessage = "Loading collection " + name;
    Print.Show(collectionMessage);
    Collection = Db.GetCollection<Data>(name);
}
/// <summary>
/// Reads every record from <c>Collection</c>, reports how many were found, and, when there
/// is at least one, replaces <c>_crawledUrls</c> with a set built from their <c>Content</c>
/// values. With no records, <c>_crawledUrls</c> is left untouched.
/// </summary>
private void ReadInitialHashSet()
{
    var stored = Collection.FindAll();

    // Reuse the result if it already is an array; otherwise copy it into one so the
    // records are only fetched once.
    var records = stored as Data[] ?? stored.ToArray();
    Print.Show("Hashset count " + records.Length);

    if (records.Length == 0)
    {
        return;
    }

    _crawledUrls = new HashSet<string>(records.Select(record => record.Content));
}
/// <summary>
/// Reads every record from <c>Collection</c>, reports how many were found, and, when there
/// is at least one, replaces <c>_queue</c> with a queue built from their <c>Content</c>
/// values in the order returned. With no records, <c>_queue</c> is left untouched.
/// </summary>
private void ReadInitialQueue()
{
    var stored = Collection.FindAll();

    // Materialize before counting: calling Count() on the raw FindAll() result and then
    // ToArray() enumerated the sequence twice.
    var items = stored as Data[] ?? stored.ToArray();
    Print.Show("Queue Item count " + items.Length);

    if (items.Length > 0)
    {
        _queue = new Queue<string>(items.Select(item => item.Content));
    }
}
/// <summary>
/// Entry point of the crawl. When <c>Queue</c> is empty, every entry of <c>Seeds</c> is
/// passed to <c>DownloadAndEnqueue</c> first; afterwards <c>GetFromQueue</c> is called.
/// </summary>
public void Start()
{
    Print.Show("Start crawling...");

    // Only bootstrap from the seeds when there is no queued work at all.
    bool queueIsEmpty = Queue.Count == 0;
    if (queueIsEmpty)
    {
        foreach (var seedUrl in Seeds)
        {
            DownloadAndEnqueue(seedUrl);
        }
    }

    // Continue by retrieving work from the queue.
    GetFromQueue();
}
/// <summary>
/// Downloads the page at <c>Uri</c>, removes its script and style elements, and passes the
/// title and body text to <c>Worker.SaveDocument</c>.
/// </summary>
/// <returns>
/// The loaded document, or <c>null</c> when loading throws. The document is returned
/// without being saved when its parsed text is missing or shorter than 10 characters, or
/// when it has no body element.
/// </returns>
public HtmlDocument Load()
{
    string address = Uri.AbsoluteUri;
    Print.Show("Loading url: " + address);

    HtmlDocument doc;
    try
    {
        doc = new HtmlWeb().Load(address);
    }
    catch (Exception ex)
    {
        // Best effort: a page that cannot be fetched is reported and skipped.
        Print.Show(ex.Message);
        return null;
    }

    // Measure the text null-safely; the log line used to dereference ParsedText before
    // the null check below had a chance to run.
    var parsedText = doc.ParsedText;
    int size = parsedText == null ? 0 : parsedText.Length;
    Print.Show("Saving url: " + address + " Size:" + size);
    if (size < 10)
    {
        return doc;
    }

    // Get text from title and body.
    var title = doc.DocumentNode.SelectSingleNode("//head//title");
    var body = doc.DocumentNode.SelectSingleNode("//body");
    if (body == null)
    {
        // No body element means there is no content to store; the caller still gets the document.
        Print.Show("No body element in " + address + ", nothing to save");
        return doc;
    }

    // Remove script and style elements. SelectNodes may yield null instead of an empty
    // collection when nothing matches, so guard before iterating.
    var nodes = body.SelectNodes("//script|//style");
    if (nodes != null)
    {
        foreach (var node in nodes)
        {
            node.ParentNode.RemoveChild(node);
        }
    }

    HtmlToTextConverter textConverter = new HtmlToTextConverter();
    Worker.SaveDocument(new SiteInfo
    {
        BodyContent = textConverter.ToText(body.InnerText),
        // Pages without a <title> are saved with an empty title instead of crashing.
        TitleContent = title == null ? string.Empty : textConverter.ToText(title.InnerText),
        Url = address
    });
    return doc;
}
/// <summary>
/// Collects the <c>href</c> values of all anchor elements in <paramref name="html"/> and
/// returns them as absolute URLs.
/// </summary>
/// <param name="html">Document whose anchors are scanned.</param>
/// <param name="url">Address of the document; used as the base for links that
/// <c>ValidateUrl.Validate</c> does not accept as they are.</param>
/// <returns>
/// The collected URLs. Links containing <c>#</c> are skipped, and links that fail to
/// resolve are reported through <c>Print.Show</c> and omitted.
/// </returns>
public IEnumerable<string> Parse(HtmlDocument html, Uri url)
{
    Print.Show("Parsing url:" + url.AbsoluteUri);

    // Materialize the query once: counting and then iterating the deferred sequence
    // walked the whole document twice.
    var linkedPages = html.DocumentNode.Descendants("a")
        .Select(a => a.GetAttributeValue("href", null))
        .Where(u => !String.IsNullOrEmpty(u))
        .ToList();
    Print.Show(linkedPages.Count + " urls in" + url.AbsoluteUri);

    List<string> uriList = new List<string>(linkedPages.Count);
    foreach (string linkedPage in linkedPages)
    {
        // Links carrying a fragment marker are ignored entirely.
        if (linkedPage.Contains("#"))
        {
            continue;
        }

        var result = ValidateUrl.Validate(linkedPage);
        try
        {
            if (!result.Item1)
            {
                // Not accepted as-is: resolve it against the page's own address.
                uriList.Add(new Uri(url, linkedPage).AbsoluteUri);
            }
            else
            {
                uriList.Add(linkedPage);
            }
        }
        catch (Exception ex)
        {
            // Best effort: an unresolvable link is reported and skipped.
            Print.Show(ex.Message);
        }
    }

    return uriList;
}
/// <summary>
/// Initializes the store on the "HashSet" collection: starts with an empty set of crawled
/// URLs, then calls <c>ReadInitialHashSet</c>.
/// </summary>
public SavingHashSet()
    : base("HashSet")
{
    // The empty set must exist before the initial read runs.
    _crawledUrls = new HashSet<string>();

    Print.Show("Reading hashset...");
    ReadInitialHashSet();
}
/// <summary>
/// Initializes the store on the "Queue" collection: starts with an empty queue, then calls
/// <c>ReadInitialQueue</c>.
/// </summary>
public SavingQueue()
    : base("Queue")
{
    // The empty queue must exist before the initial read runs.
    _queue = new Queue<string>();

    Print.Show("Initializing Queue");
    ReadInitialQueue();
}