示例#1
0
文件: Program.cs 项目: bolsson/Spider
        //const string Seed = "http://kenrockwell.com";

        /// <summary>
        /// Entry point: repeatedly takes a link from the link table, crawls it, parses the
        /// returned page, stores the page's words in the inverted index and queues the
        /// links found on the page. Links whose crawl or parse fails are skipped.
        /// </summary>
        /// <param name="args">Command-line arguments; not used by this method.</param>
        static void Main(string[] args)
        {
            Spider        spider    = new Spider();
            LinkTable     linkTable = new LinkTable();
            ParseHtml     parser    = new ParseHtml();
            InvertedIndex store     = new InvertedIndex();

            while (linkTable.HasLink())
            {
                var link    = linkTable.GetLink();
                var webPage = spider.Crawl(link);

                // Wait for the crawl to finish BEFORE touching Result. Reading Result on a
                // faulted or canceled task throws, so the failure has to be detected first.
                // Assumes Crawl returns a Task<T> (the original code already relied on
                // Status / IsFaulted / Result) - TODO confirm.
                try
                {
                    webPage.Wait();
                }
                catch (System.AggregateException)
                {
                    // Deliberate best effort: a failed or canceled download just skips this link.
                    continue;
                }

                var response = webPage.Result;

                // NOTE(review): ToString() is called on the task object, not on the downloaded
                // content, so this length check presumably never measures the page size.
                // Kept as-is to preserve behavior; confirm the intent and replace it with a
                // check on the response body length.
                if (response == null || !response.IsSuccessStatusCode || webPage.ToString().Length > 10000000)
                {
                    continue;
                }

                var htmlDoc = parser.GetDocument(response);

                // Same rule for parsing: checking Status on a task that may still be running
                // is racy, so wait for completion and treat a failure as "skip this page".
                try
                {
                    htmlDoc.Wait();
                }
                catch (System.AggregateException)
                {
                    continue;
                }

                var document    = htmlDoc.Result;
                var linksOnPage = parser.GetLinks(document);
                var wordsOnPage = parser.GetWords(document);
                store.Add(link, wordsOnPage);

                linkTable.Add(linksOnPage);
            }
        }
示例#2
0
文件: Program.cs 项目: bolsson/Spider
        //const string Seed = "http://kenrockwell.com";

        /// <summary>
        /// Entry point: repeatedly takes a link from the link table, crawls it, parses the
        /// returned page, stores the page's words in the inverted index and queues the
        /// links found on the page. Links whose crawl or parse fails are skipped.
        /// </summary>
        /// <param name="args">Command-line arguments; not used by this method.</param>
        static void Main(string[] args)
        {
            Spider spider = new Spider();
            LinkTable linkTable = new LinkTable();
            ParseHtml parser = new ParseHtml();
            InvertedIndex store = new InvertedIndex();

            while (linkTable.HasLink())
            {
                var link = linkTable.GetLink();
                var webPage = spider.Crawl(link);

                // Wait for the crawl to finish BEFORE touching Result. Reading Result on a
                // faulted or canceled task throws, so the failure has to be detected first.
                // Assumes Crawl returns a Task<T> (the original code already relied on
                // Status / IsFaulted / Result) - TODO confirm.
                try
                {
                    webPage.Wait();
                }
                catch (System.AggregateException)
                {
                    // Deliberate best effort: a failed or canceled download just skips this link.
                    continue;
                }

                var response = webPage.Result;

                // NOTE(review): ToString() is called on the task object, not on the downloaded
                // content, so this length check presumably never measures the page size.
                // Kept as-is to preserve behavior; confirm the intent and replace it with a
                // check on the response body length.
                if (response == null || !response.IsSuccessStatusCode || webPage.ToString().Length > 10000000)
                {
                    continue;
                }

                var htmlDoc = parser.GetDocument(response);

                // Same rule for parsing: checking Status on a task that may still be running
                // is racy, so wait for completion and treat a failure as "skip this page".
                try
                {
                    htmlDoc.Wait();
                }
                catch (System.AggregateException)
                {
                    continue;
                }

                var document = htmlDoc.Result;
                var linksOnPage = parser.GetLinks(document);
                var wordsOnPage = parser.GetWords(document);
                store.Add(link, wordsOnPage);

                linkTable.Add(linksOnPage);
            }
        }