예제 #1
0
파일: Program.cs 프로젝트: bolsson/Spider
        //const string Seed = "http://kenrockwell.com";

        /// <summary>
        /// Crawl loop: takes links from the link table one at a time, downloads each page,
        /// parses it, stores the page's words in the inverted index under the link, and
        /// adds the links found on the page back to the link table.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Spider        spider    = new Spider();
            LinkTable     linkTable = new LinkTable();
            ParseHtml     parser    = new ParseHtml();
            InvertedIndex store     = new InvertedIndex();

            while (linkTable.HasLink())
            {
                var link    = linkTable.GetLink();
                var webPage = spider.Crawl(link);

                // Block until the download has finished. Reading Result on a faulted or
                // cancelled task throws, so failures are handled here, before Result is
                // touched, and the link is simply skipped.
                try
                {
                    webPage.Wait();
                }
                catch (System.AggregateException)
                {
                    continue;
                }

                // NOTE(review): webPage.ToString() is the text form of the crawl task object,
                // not the page body, so this size limit probably never triggers — confirm
                // what the intended page-size check was.
                if (webPage.Result == null || !webPage.Result.IsSuccessStatusCode || webPage.ToString().Length > 10000000)
                {
                    continue;
                }

                var htmlDoc = parser.GetDocument(webPage.Result);

                // Same reasoning as above: wait first so a parse failure skips the page
                // instead of throwing when Result is read below.
                try
                {
                    htmlDoc.Wait();
                }
                catch (System.AggregateException)
                {
                    continue;
                }

                var linksOnPage = parser.GetLinks(htmlDoc.Result);
                var wordsOnPage = parser.GetWords(htmlDoc.Result);
                store.Add(link, wordsOnPage);

                linkTable.Add(linksOnPage);
            }
        }
예제 #2
0
파일: Evaluater.cs 프로젝트: bolsson/Spider
 /// <summary>
 /// Creates an evaluater for the tree rooted at <paramref name="root"/> and computes
 /// its result immediately: the index is stored, an empty stack and queue are created,
 /// <c>evaluaterOrder</c> is run on the root, and the value popped from the stack
 /// afterwards becomes <c>result</c>.
 /// </summary>
 /// <param name="root">Root node handed to <c>evaluaterOrder</c>.</param>
 /// <param name="index">Inverted index kept in <c>_index</c>.</param>
 public Evaluater(Node root, InvertedIndex index)
 {
     _index = index;
     stack = new Stack<IEnumerable<string>>();
     queue = new Queue<Token>();

     // Presumably evaluaterOrder leaves the final value on top of the stack — verify.
     evaluaterOrder(root);

     result = stack.Pop();
 }
예제 #3
0
파일: Evaluater.cs 프로젝트: bolsson/Spider
 /// <summary>
 /// Sets up the evaluater state (index reference, empty stack, empty token queue),
 /// runs <c>evaluaterOrder</c> over the tree starting at <paramref name="root"/>,
 /// and stores the value popped from the stack as <c>result</c>.
 /// </summary>
 /// <param name="root">Root node passed to <c>evaluaterOrder</c>.</param>
 /// <param name="index">Inverted index assigned to <c>_index</c>.</param>
 public Evaluater(Node root, InvertedIndex index)
 {
     _index = index;
     stack = new Stack<IEnumerable<string>>();
     queue = new Queue<Token>();

     evaluaterOrder(root);

     // The stack is expected to hold the outcome at this point — TODO confirm against evaluaterOrder.
     result = stack.Pop();
 }
예제 #4
0
 /// <summary>
 /// Replaces <c>index</c> with a fresh inverted index and fills it with six test
 /// documents (doc1..doc6), each with four words. Marked as work in progress by
 /// the original author.
 /// </summary>
 public void Init()
 {
     index = new InvertedIndex();

     index.Add(doc1, new List<string> { "word1", "word2", "word3", "word4" });
     index.Add(doc2, new List<string> { "word2", "word4", "word6", "word8" });
     index.Add(doc3, new List<string> { "word3", "word6", "word9", "word12" });
     index.Add(doc4, new List<string> { "word4", "word8", "word12", "word16" });
     index.Add(doc5, new List<string> { "word5", "word10", "word15", "word20" });
     index.Add(doc6, new List<string> { "word6", "word12", "word18", "word24" });
 }
예제 #5
0
파일: Program.cs 프로젝트: bolsson/Spider
        //const string Seed = "http://kenrockwell.com";

        /// <summary>
        /// Runs the crawler: while the link table has links, fetches the next one,
        /// parses the downloaded page, records its words in the inverted index under
        /// that link, and adds the page's outgoing links to the link table.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Spider spider = new Spider();
            LinkTable linkTable = new LinkTable();
            ParseHtml parser = new ParseHtml();
            InvertedIndex store = new InvertedIndex();

            while (linkTable.HasLink())
            {
                var link = linkTable.GetLink();
                var webPage = spider.Crawl(link);

                // Wait for the fetch to complete before looking at Result: Result throws
                // for a faulted or cancelled task, which would end the whole crawl instead
                // of skipping this one link.
                try
                {
                    webPage.Wait();
                }
                catch (System.AggregateException)
                {
                    continue;
                }

                // NOTE(review): the ToString() length below is taken from the task object,
                // not from the downloaded content, so it likely never exceeds the limit —
                // confirm the intended size check.
                if (webPage.Result == null || !webPage.Result.IsSuccessStatusCode || webPage.ToString().Length > 10000000)
                {
                    continue;
                }

                var htmlDoc = parser.GetDocument(webPage.Result);

                // A status check without waiting can pass while parsing is still running;
                // wait here so a failed parse is skipped rather than thrown from Result.
                try
                {
                    htmlDoc.Wait();
                }
                catch (System.AggregateException)
                {
                    continue;
                }

                var linksOnPage = parser.GetLinks(htmlDoc.Result);
                var wordsOnPage = parser.GetWords(htmlDoc.Result);
                store.Add(link, wordsOnPage);

                linkTable.Add(linksOnPage);
            }
        }