/// <summary>
/// Bumps the shared <c>MONITOR.objects_disposed</c> counter and emits a diagnostic
/// log entry that carries the updated count.
/// </summary>
public void Dispose()
{
    MONITOR.objects_disposed += 1;

    // The count is appended directly after the message text, with no separator.
    var disposedSoFar = MONITOR.objects_disposed;
    MONITOR.LogDiagnostic("object has been disposed." + disposedSoFar);
}
/// <summary>
/// Fetches <paramref name="url"/> after a randomized 5-6 second delay, stores every
/// profile name linked from the page that is not yet in <c>MONITOR.listOfUniqueUsers</c>
/// in the SQLite cache, and returns the page's follow-up links as absolute URLs.
/// </summary>
/// <param name="url">Address of the page to download and scan.</param>
/// <returns>
/// The follow-up links found on the page, each prefixed with <c>https://www.quora.com</c>.
/// Empty when the page could not be loaded or has no matching links. The same links are
/// also appended to <c>MONITOR.listOfUnvisitedPages</c>.
/// </returns>
/// <remarks>
/// Load failures are logged through <c>MONITOR.LogError</c> and yield an empty result.
/// Failures of the SQLite insert are not caught here and propagate to the caller.
/// </remarks>
public List<string> spawn(string url)
{
    Random rnd = new Random();
    int wait_time = 5000 + rnd.Next(1000);
    MONITOR.LogMulti("Elapsed: " + (DateTime.Now.Subtract(MONITOR.START_TIME).TotalMinutes.ToString() + ", wt@" + wait_time + "mS, " + url));

    // Start from an empty document so a failed download simply produces no links.
    HtmlDocument doc = new HtmlDocument();
    try
    {
        System.Threading.Thread.Sleep(wait_time);
        doc = new HtmlWeb().Load(url);
    }
    catch (Exception e)
    {
        MONITOR.LogError(e.Message);
    }

    List<string> hrefTags = ExtractHrefValues(doc);
    SaveNewProfiles(hrefTags, url);
    return CollectFollowUpLinks(hrefTags);
}

/// <summary>Returns the <c>href</c> value of every anchor in <paramref name="doc"/>.</summary>
private static List<string> ExtractHrefValues(HtmlDocument doc)
{
    List<string> hrefValues = new List<string>();
    try
    {
        // SelectNodes returns null (not an empty collection) when nothing matches.
        var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
        if (anchors != null)
        {
            foreach (HtmlNode link in anchors)
            {
                hrefValues.Add(link.Attributes["href"].Value);
            }
        }
    }
    catch (Exception e)
    {
        MONITOR.LogError(e.Message);
    }

    return hrefValues;
}

/// <summary>Inserts each not-yet-seen profile name into the SQLite cache together with the page it was found on.</summary>
private static void SaveNewProfiles(List<string> hrefTags, string url)
{
    foreach (string tag in hrefTags)
    {
        if (!Regex.IsMatch(tag, @"profile/"))
        {
            continue;
        }

        string user = tag.Split('/').Last();
        if (MONITOR.listOfUniqueUsers.Contains(user))
        {
            continue;
        }

        // The name comes from scraped markup, so it is bound as a parameter instead of
        // being concatenated into the SQL text.
        using (SQLiteConnection qdbconnection = new SQLiteConnection("Data Source=D:/cache/quora.sqlite;Version=3"))
        using (SQLiteCommand command = new SQLiteCommand("INSERT INTO quora (name, link) values (@name, @link)", qdbconnection))
        {
            command.Parameters.AddWithValue("@name", user);
            command.Parameters.AddWithValue("@link", url);
            qdbconnection.Open();
            command.ExecuteNonQuery();
        }

        MONITOR.listOfUniqueUsers.Add(user);
    }
}

/// <summary>Builds the absolute follow-up URLs from the page's hrefs and queues them in <c>MONITOR.listOfUnvisitedPages</c>.</summary>
private static List<string> CollectFollowUpLinks(List<string> hrefTags)
{
    List<string> links_from_this_page = new List<string>();

    // Sequential on purpose: the per-link work is trivial and List<T>.Add is not safe
    // to call from several threads at once.
    foreach (string hrefTag in hrefTags)
    {
        if (hrefTag.Contains("topic") || hrefTag.Contains("profile") || hrefTag.Contains("answer") || hrefTag.Contains("careers/"))
        {
            continue;
        }

        // Only site-relative links containing a dash are followed; the length check
        // keeps an empty href from throwing on the index access.
        if (hrefTag.Length == 0 || hrefTag[0] != '/' || !hrefTag.Contains("-"))
        {
            continue;
        }

        string absolute = @"https://www.quora.com" + hrefTag.Trim(Environment.NewLine.ToCharArray());
        try
        {
            links_from_this_page.Add(absolute);
            MONITOR.listOfUnvisitedPages.Add(absolute);
        }
        catch (Exception e)
        {
            MONITOR.LogError(e.Message + e.StackTrace);
        }
    }

    return links_from_this_page;
}
//Server=localhost\SQLEXPRESS;Database=master;Trusted_Connection=True;
/// <summary>
/// Entry point: resets the <c>MONITOR</c> counters, crawls outward from a fixed seed
/// URL using an explicit stack of link lists, and prints the final counters.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    Console.SetBufferSize(480, 160);
    string url = @"https://www.quora.com/Can-social-networks-be-scraped-for-data";

    MONITOR.START_TIME = DateTime.Now;
    MONITOR.pages_visited = 0;
    MONITOR.objects_disposed = 0;

    // Pages link back to each other, so every URL is fetched at most once; without
    // this the traversal would keep re-downloading the same pages.
    HashSet<string> visited = new HashSet<string>();
    visited.Add(url);

    var stack = new Stack<List<string>>();
    using (spider n = new spider())
    {
        stack.Push(n.spawn(url));
        while (stack.Count > 0)
        {
            try
            {
                var current = stack.Pop();
                foreach (var child in current)
                {
                    // HashSet.Add returns false when the URL was already recorded.
                    if (!visited.Add(child))
                    {
                        continue;
                    }

                    try
                    {
                        using (spider n2 = new spider())
                        {
                            stack.Push(n2.spawn(child));
                        }
                    }
                    catch (Exception e)
                    {
                        // A failing page is logged and skipped; the crawl continues.
                        MONITOR.LogError(e.Message);
                    }
                }
            }
            catch (Exception e)
            {
                MONITOR.LogError(e.Message);
            }
        }
    }

    Console.WriteLine("PAGES VISITED: " + MONITOR.pages_visited + ". OBJECTS DISPOSED: " + MONITOR.objects_disposed);
}
/// <summary>
/// Loads <paramref name="url"/>, appends every newly seen user id linked from the page
/// to the usernames cache file, then recursively processes each linked question page
/// that has not been visited yet, pausing 60 seconds before each one.
/// </summary>
/// <param name="url">Address of the page to load and scan.</param>
/// <remarks>
/// Exceptions from the page load or the file write are not caught here and propagate
/// to the caller.
/// </remarks>
public void R(string url)
{
    HtmlWeb hw = new HtmlWeb();
    HtmlDocument doc = hw.Load(url);

    // SelectNodes returns null (not an empty collection) when the page has no anchors.
    List<string> hrefTags = new List<string>();
    var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
    if (anchors != null)
    {
        foreach (HtmlNode link in anchors)
        {
            hrefTags.Add(link.Attributes["href"].Value);
        }
    }

    // Users already written during this call; checked before the shared MONITOR list.
    HashSet<string> uniqueUsers = new HashSet<string>();
    foreach (string tag in hrefTags)
    {
        if (!Regex.IsMatch(tag, @"users/\d+"))
        {
            continue;
        }

        // The last path segment of the link is used as the user identifier.
        string user = tag.Split('/').Last();
        if (uniqueUsers.Contains(user) || MONITOR.listOfUniqueUsers.Contains(user))
        {
            continue;
        }

        uniqueUsers.Add(user);
        MONITOR.listOfUniqueUsers.Add(user);
        using (StreamWriter sw = File.AppendText(@"D:/cache/cache-so-usernames.txt"))
        {
            sw.WriteLine(user);
        }
    }

    foreach (string tag in hrefTags)
    {
        if (!Regex.IsMatch(tag, @"http.*?questions/\d+"))
        {
            continue;
        }

        if (tag.Contains("promotion") || MONITOR.visitedpages.Contains(tag))
        {
            continue;
        }

        MONITOR.Log(tag);
        System.Threading.Thread.Sleep(60000);

        // Mark before recursing so pages that link back here are not fetched again.
        MONITOR.visitedpages.Add(tag);
        using (N n = new N())
        {
            n.R(tag);
        }
    }
}
/// <summary>
/// Logs a fixed message through <c>MONITOR.Log</c>; this method does nothing else.
/// </summary>
public void Dispose()
{
    const string disposedMessage = "object has been disposed.";
    MONITOR.Log(disposedMessage);
}
/// <summary>
/// Passes <paramref name="data"/> to <c>MONITOR.Log</c> while the console foreground
/// colour is set to red, then resets the console colours.
/// </summary>
/// <param name="data">Text to log.</param>
public static void LogError(string data)
{
    Console.ForegroundColor = ConsoleColor.Red;
    try
    {
        MONITOR.Log(data);
    }
    finally
    {
        // Reset even when logging throws, so later console output is not left red.
        Console.ResetColor();
    }
}
/// <summary>
/// Passes <paramref name="data"/> to <c>MONITOR.Log</c> while the console foreground
/// colour is set to cyan, then resets the console colours.
/// </summary>
/// <param name="data">Text to log.</param>
public static void LogDiagnostic(string data)
{
    Console.ForegroundColor = ConsoleColor.Cyan;
    try
    {
        MONITOR.Log(data);
    }
    finally
    {
        // Reset even when logging throws, so later console output is not left cyan.
        Console.ResetColor();
    }
}