/// <summary>
/// Entry point: builds a <c>Spider</c> for a fixed start/base URL, calls
/// <c>spider()</c>, polls <c>getResults()</c> until it returns a non-null
/// list, and then prints each page's aliases, outgoing links and referrers
/// to the console.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
        {
            string startUrl = "http://www.ideaeng.com/";
            string baseUrl  = "http://www.ideaeng.com";

            // NOTE(review): the meaning of the numeric arguments (500, 10) is defined by
            // the Spider constructor, which is not visible here.
            Spider.Spider s = new Spider.Spider(startUrl, baseUrl, 500, 10);

            s.spider();

            // Poll until results are available.
            // NOTE(review): this is a busy-wait; if getResults() is a cheap, side-effect-free
            // check, consider sleeping briefly between polls or using a completion signal.
            List <SpiderPage> results = null;
            do
            {
                results = s.getResults();
            } while (results == null);

            foreach (SpiderPage page in results)
            {
                string        pageUrl = page.getUrl();
                List <string> aliases = page.getAliasUrls();
                List <string> links   = page.getLinkingToUrls();
                List <string> refs    = page.getReferencedByUrls();

                // The alias section must be driven by the alias list itself; using the
                // link count here mis-reports the total and can index past the end of
                // the alias list.
                System.Console.WriteLine("\t" + pageUrl + " has " + aliases.Count + " alias(es):");
                foreach (string alias in aliases)
                {
                    System.Console.WriteLine("\t\t" + alias);
                }

                System.Console.WriteLine("\t" + pageUrl + " links to " + links.Count + " page(s):");
                foreach (string link in links)
                {
                    System.Console.WriteLine("\t\t" + link);
                }

                System.Console.WriteLine("\t" + pageUrl + " is referred to by " + refs.Count + " page(s):");
                foreach (string referrer in refs)
                {
                    System.Console.WriteLine("\t\t" + referrer);
                }

                System.Console.WriteLine("------------------------------------------------------------------------------------");
            }
        }
Beispiel #2
0
        /// <summary>
        /// For each page in <paramref name="results"/>: writes the page content to a
        /// file under the relative <c>pages\</c> directory and prints the page's
        /// aliases, linked-to pages and referring pages to the console.
        /// </summary>
        /// <param name="results">Pages to report on; iterated in list order.</param>
        static void printResults(List <SpiderPage> results)
        {
            for (int i = 0; i < results.Count; i++)
            {
                SpiderPage        curr         = results[i];
                List <string>     curr_aliases = curr.getAliasUrls();
                List <SpiderLink> curr_links   = curr.getLinkingToLinks();
                List <SpiderLink> curr_refs    = curr.getReferredByLinks();

                // Build the output filename: "<index>-<first path segment>.html".
                // Segments[0] is the leading "/", so Segments[1] (when present) is the
                // first real path component; its trailing "/" is turned into "_".
                string[] fileparts = (new Uri(curr.getUrl())).Segments;
                string   filename  = i + "-";
                if (fileparts.Length > 1)
                {
                    filename = filename + fileparts[1].Replace("/", "_");
                }
                filename = filename + ".html";

                string path = "pages\\" + filename;
                System.Console.WriteLine(path);

                // Dispose the writer so the content is flushed and the file handle is
                // released before moving on to the next page.
                using (StreamWriter sw = new StreamWriter(path))
                {
                    sw.Write(curr.getPageContent());
                }

                System.Console.WriteLine("--------------------------------------------------------------------");
                System.Console.WriteLine("REAL_PAGE - " + curr.getUrl());
                System.Console.WriteLine("--------------------------------------------------------------------");
                if (curr_aliases.Count > 0)
                {
                    System.Console.WriteLine("\t" + curr.getUrl() + " has these aliases (non-normalized):");
                    foreach (string alias in curr_aliases)
                    {
                        System.Console.WriteLine("\t\t" + alias);
                    }
                }
                else
                {
                    System.Console.WriteLine("\t0 aliases.");
                }

                System.Console.WriteLine("\t------------------------------------------------------------------------------");
                if (curr_links.Count > 0)
                {
                    System.Console.WriteLine("\t" + curr_links.Count + " linked-to page(s):");
                    foreach (SpiderLink link in curr_links)
                    {
                        // Links flagged as not legal are shown with a placeholder
                        // instead of their normalized URL.
                        string curr_link_url = link.getNormalizedUrl();
                        if (!link.isLegalLink())
                        {
                            curr_link_url = "< link not followed >";
                        }
                        System.Console.WriteLine("\t\t" + curr_link_url);
                        System.Console.WriteLine("\t\t\t original href text: " + link.getOriginalUrl());
                    }
                }
                else
                {
                    System.Console.WriteLine("\t0 pages linked-to.");
                }

                System.Console.WriteLine("\t------------------------------------------------------------------------------");
                System.Console.WriteLine("\t" + curr_refs.Count + " referred-by page(s):");
                foreach (SpiderLink referrer in curr_refs)
                {
                    System.Console.WriteLine("\t\t" + referrer.getReferringUrl());
                    System.Console.WriteLine("\t\t\t original href text: " + referrer.getOriginalUrl());
                }
            }
        }