/// <summary>
/// Matches anchors whose href is an absolute <c>http:</c> address, up to and including
/// the first quote, <c>#</c> or <c>?</c> that terminates it. Built once instead of per call.
/// </summary>
private static readonly Regex TopicLinkRegex = new Regex("<a href=\"http:[^\"#?]+[\"#?]");

/// <summary>
/// Collects the outgoing links of the page at <paramref name="uri"/> and keeps only those
/// that <c>CNNPage.IsNewsPage</c> or <c>CNNPage.isTopicPage</c> accepts.
/// </summary>
/// <remarks>
/// A topic page is downloaded with <c>Downloader.FetchPage</c> and scanned with
/// <see cref="TopicLinkRegex"/>; duplicates are dropped and first-seen order is kept.
/// Any other page is loaded as a <c>CNNPage</c> and its <c>allLinks</c> are used as-is.
/// </remarks>
/// <param name="uri">Address of the page whose links should be extracted.</param>
/// <returns>
/// The filtered links. Empty when nothing qualifies, when the download yields null,
/// or when a non-topic page cannot be loaded.
/// </returns>
private static Uri[] GetLinks(string uri)
{
    var links = new List<Uri>();

    if (CNNPage.isTopicPage(uri))
    {
        string page = Downloader.FetchPage(uri);

        // Regex.Matches rejects a null input, so treat a null download as "no links".
        if (page != null)
        {
            // The set gives constant-time duplicate detection; the list preserves order.
            var seen = new HashSet<Uri>();
            foreach (Match m in TopicLinkRegex.Matches(page))
            {
                // Drop the 9-character prefix `<a href="` and the single trailing delimiter.
                string target = m.Value.Substring(9, m.Value.Length - 10);

                // A malformed address is skipped instead of aborting the whole page.
                Uri parsed;
                if (Uri.TryCreate(target, UriKind.Absolute, out parsed) && seen.Add(parsed))
                {
                    links.Add(parsed);
                }
            }
        }
    }
    else
    {
        try
        {
            CNNPage curr = new CNNPage(uri);
            links.AddRange(curr.allLinks);
        }
        catch (Exception ex)
        {
            // Still best-effort: the page contributes no links, but the cause is no longer hidden.
            Console.WriteLine(ex);
        }
    }

    // Only news and topic pages are of interest to the callers.
    return links
        .FindAll(u => CNNPage.IsNewsPage(u.OriginalString) || CNNPage.isTopicPage(u.OriginalString))
        .ToArray();
}
/// <summary>
/// Executes the statement held in <c>sql</c> on <c>conn</c> for one page, binding the page's
/// address, words, links and plain text plus the current local time as parameters.
/// </summary>
/// <remarks>
/// Prints <c>Saved</c> after the statement has run. Any exception is written to the console
/// and not rethrown, so callers never see a failure from this method.
/// </remarks>
/// <param name="page">The page whose data is bound to the statement's parameters.</param>
public void Do(CNNPage page)
{
    try
    {
        using (SqlCommand cmd = new SqlCommand(sql, conn))
        {
            var args = cmd.Parameters;

            args.AddWithValue("@url", page.uri);

            // ToOneString receives the collection and a number (4000 / 1000);
            // presumably a length cap for the flattened text — TODO confirm against ToOneString.
            args.AddWithValue("@words", ToOneString(page.words, 4000));
            args.AddWithValue("@links", ToOneString(page.allLinks, 1000));

            args.AddWithValue("@raw", page.pureText);

            // NOTE(review): ToString() with no arguments formats with the current culture,
            // so the text sent as @date varies by machine settings — confirm this is intended.
            args.AddWithValue("@date", DateTime.Now.ToString());

            cmd.ExecuteNonQuery();
            Console.WriteLine("Saved");
        }
    }
    catch (Exception failure)
    {
        Console.WriteLine(failure);
    }
}
/// <summary>
/// Processes the crawl queue until it is empty, then prints <c>END</c> and returns.
/// </summary>
/// <remarks>
/// For each dequeued address that <c>history</c> has not seen yet:
/// a news page is loaded as a <c>CNNPage</c> and handed to <c>newsAction</c>;
/// a topic page is handed to a new <c>TopicAction</c> and its links (via <c>GetLinks</c>)
/// are passed to <c>AddLinksToPagesToVisit</c>. Every such address is marked visited exactly
/// once, whether processing succeeded, failed, or the page was neither news nor topic.
/// Exceptions are written to the console and the loop continues with the next address.
/// </remarks>
public void Run()
{
    while (true)
    {
        try
        {
            if (queue.Count == 0)
            {
                Console.WriteLine("END");
                break;
            }

            Uri curr = queue.Dequeue();
            Console.WriteLine("PROCESSING: " + curr);

            if (history.WasVisited(curr.OriginalString))
            {
                Console.WriteLine("SKIP: Page have been already visited");
                Console.WriteLine("--");
                continue;
            }

            try
            {
                string url = curr.OriginalString;

                if (CNNPage.IsNewsPage(url))
                {
                    // News pages are saved; their own outgoing links are not enqueued.
                    CNNPage page = new CNNPage(url);
                    newsAction.Do(page);
                }

                if (CNNPage.isTopicPage(url))
                {
                    var action = new TopicAction(conn);
                    action.Do(url);

                    Uri[] links = GetLinks(url);
                    AddLinksToPagesToVisit(links);
                }
            }
            finally
            {
                // Single place that records the visit. Pages that are neither news nor topic
                // end up here as well, so they need no separate branch that would mark them
                // (and print the separator) a second time.
                history.SetVisited(curr);
                Console.WriteLine("--");
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex);
        }
    }
}
/// <summary>
/// Processes the crawl queue until it is empty; for every unvisited topic page it runs a
/// <c>TopicAction</c> and then immediately saves each news page linked from that topic.
/// </summary>
/// <remarks>
/// Links are obtained with <c>GetLinks</c>; each one accepted by <c>CNNPage.IsNewsPage</c> is
/// loaded as a <c>CNNPage</c> and passed to <c>newsAction</c>. A failure on one link is
/// reported and does not stop the remaining links. An address is marked visited only when
/// its processing finished without an exception reaching the outer handler; non-topic
/// addresses are simply marked visited. Already visited addresses are dropped silently.
/// </remarks>
public void RunTopicRec()
{
    while (true)
    {
        if (queue.Count == 0)
        {
            Console.WriteLine("KURWA KONIEC");
            break;
        }

        Uri curr = queue.Dequeue();
        Console.WriteLine("PROCESSING: " + curr);

        if (history.WasVisited(curr.OriginalString))
        {
            continue;
        }

        try
        {
            if (CNNPage.isTopicPage(curr.OriginalString))
            {
                var action = new TopicAction(conn);
                action.Do(curr.OriginalString);

                Console.WriteLine("Przetwarzam newsy z topicu");
                Uri[] links = GetLinks(curr.OriginalString);

                for (int i = 0; i < links.Length; i++)
                {
                    try
                    {
                        if (CNNPage.IsNewsPage(links[i].OriginalString))
                        {
                            // i is the zero-based position among all links, news or not.
                            Console.WriteLine("News: " + i + "/" + links.Length);
                            newsAction.Do(new CNNPage(links[i].OriginalString));
                            Console.WriteLine("OK");
                        }
                        else
                        {
                            Console.WriteLine("Not news :(");
                        }
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine("FAIL");
                        // Report the cause as well; a bare "FAIL" cannot be diagnosed.
                        Console.WriteLine(ex);
                    }
                }

                Console.WriteLine("- Przetworzono topic -");
            }

            history.SetVisited(curr);
        }
        catch (Exception ex)
        {
            // NOTE(review): this literal looks like mis-decoded Polish text — confirm the
            // file's encoding before touching it; it is reproduced here unchanged.
            Console.WriteLine("Wyst¹pi³ b³¹d: " + ex);
        }
    }
}