/// <summary>
/// Builds a text description (page id, title and URL) for each of the first
/// <paramref name="count"/> page indices in <paramref name="idx"/>.
/// </summary>
/// <param name="idx">Page indices to describe; the output preserves this order.</param>
/// <param name="count">Maximum number of descriptions to return. Defaults to 10.</param>
/// <returns>
/// A new list holding at most <paramref name="count"/> descriptions, one per index.
/// </returns>
public List<string> PageIdx2Description(List<int> idx, int count = 10)
{
    List<string> texts = new List<string>();

    // Apply the limit to the indices up front so that pages which would be
    // dropped from the result are never looked up or formatted.
    foreach (int id in idx.Take(count))
    {
        WEBPAGE page = GetPageByIndex(id);
        texts.Add(String.Format("PAGE_ID: {0}\n TITLE: {1}\nURL: {2}\n", id, page.TITLE, page.URL));
    }

    return texts;
}
/// <summary>
/// Performs a single crawl step: takes the next link from the pending queue,
/// fetches its content through <c>DownloadHtml</c>, records the resulting page,
/// and queues every link on that page that has not been seen before.
/// </summary>
/// <returns>
/// The page created for the dequeued link, or <c>null</c> when the queue is empty,
/// the link is rejected by <c>isLegal</c>, or <c>DownloadHtml</c> reports failure.
/// </returns>
public WEBPAGE SingleStep()
{
    if (to_visit_links.Count <= 0)
    {
        return null;
    }

    string link = to_visit_links.Dequeue();
    Console.WriteLine("{0} visited.", link);

    if (!isLegal(link))
    {
        Console.WriteLine("Failed in visit {0}, because it is illegal", link);
        return null;
    }

    string content;
    if (!DownloadHtml(link, out content))
    {
        // On failure the out value is reported as the error text.
        Console.WriteLine("Failed in visit {0}, error: {1}", link, content);
        return null;
    }

    WEBPAGE page = new WEBPAGE(link, content);

    // NOTE(review): page.id is only assigned when the link has no existing record;
    // confirm that leaving it untouched for already-recorded links is intended.
    if (!WebData.ContainsRecord(connection, link))
    {
        page.id = globalUniqueIndex++;
        WebData.SaveToLocal(page.id, content);
        // The content is read back before hashing, so the MD5 and length stored in
        // the database describe whatever LoadFromLocal returns.
        WebData.LoadFromLocal(page.id, out content);
        WebData.SaveToDB(connection, link, content.GetMD5(), content.Length, page.id);
    }

    pages.Add(link, page);

    foreach (string next_link in page.LINKS)
    {
        // Strings are immutable, so the trimmed value must be captured; the original
        // call discarded it and URLs differing only by a trailing '/' were both queued.
        // Only trailing slashes are removed so that links beginning with '/' keep
        // their leading slashes.
        string normalized = next_link.TrimEnd('/');
        if (normalized.Length == 0)
        {
            // A link consisting solely of slashes has nothing left to visit.
            continue;
        }

        if (!url_visited.Contains(normalized))
        {
            to_visit_links.Enqueue(normalized);
            url_visited.Add(normalized);
        }
    }

    return page;
}