Example #1
0
        /// <summary>
        /// Collects the links reachable from the page at <paramref name="uri"/> and keeps only
        /// those that <c>CNNPage</c> classifies as news pages or topic pages.
        /// </summary>
        /// <remarks>
        /// For a topic page the raw HTML is fetched and scanned for absolute <c>http:</c> anchors;
        /// duplicates are dropped and discovery order is preserved. For any other page the links
        /// come from <c>CNNPage.allLinks</c>; if that page cannot be built the failure is logged
        /// and an empty result is returned.
        /// </remarks>
        /// <param name="uri">Address of the page whose links should be collected.</param>
        /// <returns>The news/topic links found on the page; empty when none were found.</returns>
        private static Uri[] GetLinks(string uri)
        {
            List<Uri> links = new List<Uri>();

            if (CNNPage.isTopicPage(uri))
            {
                Regex regex = new Regex("<a href=\"http:[^\"#?]+[\"#?]");
                string page = Downloader.FetchPage(uri);

                // The set gives O(1) duplicate detection; the list keeps discovery order.
                HashSet<Uri> seen = new HashSet<Uri>();

                foreach (Match m in regex.Matches(page))
                {
                    // Strip the leading '<a href="' (9 chars) and the single terminating
                    // character ('"', '#' or '?') captured by the pattern.
                    string href = m.Value.Substring(9, m.Value.Length - 10);

                    // A single malformed href must not abort the whole harvest, so it is skipped.
                    Uri parsed;
                    if (Uri.TryCreate(href, UriKind.Absolute, out parsed) && seen.Add(parsed))
                    {
                        links.Add(parsed);
                    }
                }
            }
            else
            {
                try
                {
                    CNNPage curr = new CNNPage(uri);
                    links.AddRange(curr.allLinks);
                }
                catch (Exception ex)
                {
                    // Best effort: a page that cannot be loaded contributes no links,
                    // but the reason is reported instead of being silently discarded.
                    Console.WriteLine(ex);
                }
            }

            var nl = new List<Uri>();

            foreach (Uri u in links)
            {
                if (CNNPage.IsNewsPage(u.OriginalString) ||
                    CNNPage.isTopicPage(u.OriginalString))
                {
                    nl.Add(u);
                }
            }

            return nl.ToArray();
        }
Example #2
0
        /// <summary>
        /// Runs the command text held in <c>sql</c> against <c>conn</c>, binding the
        /// <c>@url</c>, <c>@words</c>, <c>@links</c>, <c>@raw</c> and <c>@date</c> parameters
        /// from <paramref name="page"/> and the current local time.
        /// </summary>
        /// <remarks>
        /// Any exception raised while binding or executing is written to the console and
        /// not rethrown. "Saved" is printed only after the command executed without throwing.
        /// </remarks>
        /// <param name="page">Page whose uri, words, links and text are bound to the command.</param>
        public void Do(CNNPage page)
        {
            try
            {
                using (var cmd = new SqlCommand(sql, conn))
                {
                    var args = cmd.Parameters;

                    args.AddWithValue("@url", page.uri);
                    // Word and link lists are flattened by ToOneString with limits of 4000 and 1000.
                    args.AddWithValue("@words", ToOneString(page.words, 4000));
                    args.AddWithValue("@links", ToOneString(page.allLinks, 1000));
                    args.AddWithValue("@raw", page.pureText);

                    // The timestamp is sent as text in the current culture's default format.
                    string timestamp = DateTime.Now.ToString();
                    args.AddWithValue("@date", timestamp);

                    cmd.ExecuteNonQuery();

                    Console.WriteLine("Saved");
                }
            }
            catch (Exception error)
            {
                Console.WriteLine(error);
            }
        }
Example #3
0
        /// <summary>
        /// Drains the crawl queue: every page that has not been visited yet is processed
        /// according to its kind and then marked as visited.
        /// </summary>
        /// <remarks>
        /// News pages are loaded as a <c>CNNPage</c> and handed to <c>newsAction</c>.
        /// Topic pages are handed to a <c>TopicAction</c> and their links are passed to
        /// <c>AddLinksToPagesToVisit</c>. Pages of any other kind are only marked visited.
        /// A page is marked visited exactly once, even when processing it throws; the
        /// exception is then logged and the loop continues with the next queued page.
        /// </remarks>
        public void Run()
        {
            while (queue.Count > 0)
            {
                try
                {
                    Uri curr = queue.Dequeue();
                    Console.WriteLine("PROCESSING: " + curr);

                    if (history.WasVisited(curr.OriginalString))
                    {
                        Console.WriteLine("SKIP: Page have been already visited");
                        Console.WriteLine("--");
                        continue;
                    }

                    try
                    {
                        if (CNNPage.IsNewsPage(curr.OriginalString))
                        {
                            CNNPage page = new CNNPage(curr.OriginalString);
                            newsAction.Do(page);
                        }

                        if (CNNPage.isTopicPage(curr.OriginalString))
                        {
                            var action = new TopicAction(conn);
                            action.Do(curr.OriginalString);

                            Uri[] links = GetLinks(curr.OriginalString);
                            AddLinksToPagesToVisit(links);
                        }
                    }
                    finally
                    {
                        // Single place where a page is marked visited: it runs for every
                        // page kind and also when processing fails, so no separate
                        // "neither news nor topic" branch is needed.
                        history.SetVisited(curr);
                        Console.WriteLine("--");
                    }
                }
                catch (Exception ex)
                {
                    // One failing page must not stop the crawl.
                    Console.WriteLine(ex);
                }
            }

            Console.WriteLine("END");
        }
Example #4
0
        /// <summary>
        /// Drains the crawl queue, processing only topic pages: each unvisited topic page is
        /// handed to a <c>TopicAction</c>, and every news link found on it is saved through
        /// <c>newsAction</c>.
        /// </summary>
        /// <remarks>
        /// A failure on a single news link is logged and does not stop the remaining links.
        /// A page is marked visited only when its processing finished without an exception
        /// escaping; otherwise the error is logged and the page stays unmarked.
        /// </remarks>
        public void RunTopicRec()
        {
            while (queue.Count > 0)
            {
                Uri curr = queue.Dequeue();
                Console.WriteLine("PROCESSING: " + curr);

                if (history.WasVisited(curr.OriginalString))
                {
                    continue;
                }

                try
                {
                    if (CNNPage.isTopicPage(curr.OriginalString))
                    {
                        var action = new TopicAction(conn);
                        action.Do(curr.OriginalString);

                        Console.WriteLine("Przetwarzam newsy z topicu");

                        Uri[] links = GetLinks(curr.OriginalString);
                        for (int i = 0; i < links.Length; i++)
                        {
                            try
                            {
                                if (CNNPage.IsNewsPage(links[i].OriginalString))
                                {
                                    Console.WriteLine("News: " + i + "/" + links.Length);
                                    newsAction.Do(new CNNPage(links[i].OriginalString));
                                    Console.WriteLine("OK");
                                }
                                else
                                {
                                    Console.WriteLine("Not news :(");
                                }
                            }
                            catch (Exception ex)
                            {
                                // Keep going with the remaining links, but record why this one failed.
                                Console.WriteLine("FAIL");
                                Console.WriteLine(ex);
                            }
                        }

                        Console.WriteLine("- Przetworzono topic -");
                    }

                    history.SetVisited(curr);
                }
                catch (Exception ex)
                {
                    // Message text repaired from mis-decoded Windows-1250 characters.
                    Console.WriteLine("Wystąpił błąd: " + ex);
                }
            }

            Console.WriteLine("KURWA KONIEC");
        }