コード例 #1
0
        /// <summary>
        /// Builds a short text description (page id, title and URL) for the pages
        /// at the given indices.
        /// </summary>
        /// <param name="idx">Page indices to describe, in the order they should appear in the result.</param>
        /// <param name="count">Maximum number of descriptions to return; defaults to 10.</param>
        /// <returns>
        /// One description per entry among the first <paramref name="count"/> entries of
        /// <paramref name="idx"/>, in the same order.
        /// </returns>
        public List <string> PageIdx2Description(List <int> idx, int count = 10)
        {
            List <string> texts = new List <string>();

            // Limit the indices *before* looking pages up: each index yields exactly one
            // description, so this returns the same list as truncating afterwards, but
            // pages beyond the first `count` are never fetched just to be thrown away.
            foreach (int id in idx.Take(count))
            {
                WEBPAGE page = GetPageByIndex(id);
                string  text = String.Format("PAGE_ID: {0}\n TITLE: {1}\nURL: {2}\n", id, page.TITLE, page.URL);
                texts.Add(text);
            }
            return texts;
        }
コード例 #2
0
        /// <summary>
        /// Performs one crawl step: takes the next link from the pending queue,
        /// downloads it, records it, and queues the links found on the page.
        /// </summary>
        /// <returns>
        /// The page built from the downloaded content, or <c>null</c> when the queue is
        /// empty, the link is rejected by <c>isLegal</c>, or the download fails.
        /// </returns>
        public WEBPAGE SingleStep()
        {
            if (to_visit_links.Count <= 0)
            {
                return null;
            }

            string link = to_visit_links.Dequeue();

            Console.WriteLine(String.Format("{0} visited.", link));
            string content = "";

            if (!isLegal(link))
            {
                Console.WriteLine("Failed in visit {0}, because it is illegal", link);
                return null;
            }
            else if (!DownloadHtml(link, out content))
            {
                // On failure the out value is reported as the error text.
                Console.WriteLine("Failed in visit {0}, error: {1}", link, content);
                return null;
            }

            WEBPAGE page = new WEBPAGE(link, content);

            // Only links without an existing record get a fresh id and are persisted.
            // NOTE(review): page.id is left untouched when a record already exists - confirm
            // that callers do not rely on it in that case.
            if (!WebData.ContainsRecord(connection, link))
            {
                page.id = globalUniqueIndex++;
                WebData.SaveToLocal(page.id, content);
                // NOTE(review): the content is read back right after being written, presumably
                // so the hash/length stored below match the saved form - confirm.
                WebData.LoadFromLocal(page.id, out content);
                WebData.SaveToDB(connection, link, content.GetMD5(), content.Length, page.id);
            }

            pages.Add(link, page);
            foreach (string next_link in page.LINKS)
            {
                // Strings are immutable: Trim returns a new string, so its result must be
                // kept. Using the trimmed form for both the visited check and the queue
                // makes "http://host/path/" and "http://host/path" count as the same link.
                string trimmed_link = next_link.Trim('/');
                if (!url_visited.Contains(trimmed_link))
                {
                    to_visit_links.Enqueue(trimmed_link);
                    url_visited.Add(trimmed_link);
                }
            }
            return page;
        }