/// <summary>
/// Builds a spider: loads the black list, makes sure the database file and the
/// "webs" table exist, opens the connection and seeds the page index counter.
/// </summary>
public WebSpider()
{
    // The black list file sits directly under DataFilePath.
    string blackListFile = DataFilePath + @"\BlackList.txt";
    LoadBlackList(blackListFile);

    // Database bootstrap: file first, then the connection, then the table.
    WebData.CreateDBFile();
    connection = WebData.ConnectDB();
    connection.Open();
    WebData.CreateWebTable(connection, "webs");

    // Count is defined elsewhere; presumably it reports how many records are
    // already stored - TODO confirm. When it is positive, numbering continues
    // right after the largest idx found in the table.
    if (Count > 0)
    {
        var highestStoredIndex = WebData.GetIntByExecution(connection, "select max(idx) from webs");
        globalUniqueIndex = highestStoredIndex + 1;
    }
}
/// <summary>
/// Rebuilds the in-memory <c>url2idx</c> and <c>idx2url</c> maps from the
/// <c>url</c> and <c>idx</c> columns of the "webs" table.
/// </summary>
/// <remarks>
/// Both maps are emptied before being refilled. The reader is disposed when
/// the method finishes, including when an exception escapes the loop.
/// </remarks>
public void ReadDBIndex()
{
    // The using block guarantees the reader is released; previously it was
    // left open for the lifetime of the connection.
    using (SQLiteDataReader reader = WebData.GetReaderByExecution(connection, "select url, idx from webs"))
    {
        // Clear only after the query has been issued, so a failing query
        // leaves the previous maps untouched.
        url2idx.Clear();
        idx2url.Clear();

        while (reader.Read())
        {
            string url = reader.GetString(0);
            int idx = reader.GetInt32(1);

            url2idx.Add(url, idx);
            idx2url.Add(idx, url);
        }
    }
}
/// <summary>
/// Processes the next queued link: downloads it, stores it if the database has
/// no record for it yet, and queues the links found on the page.
/// </summary>
/// <returns>
/// The page that was visited, or <c>null</c> when the queue is empty, the link
/// is rejected by <c>isLegal</c>, or the download fails.
/// </returns>
public WEBPAGE SingleStep()
{
    if (to_visit_links.Count <= 0)
    {
        return null;
    }

    string link = to_visit_links.Dequeue();
    Console.WriteLine(String.Format("{0} visited.", link));

    string content = "";
    if (!isLegal(link))
    {
        Console.WriteLine("Failed in visit {0}, because it is illegal", link);
        return null;
    }

    if (!DownloadHtml(link, out content))
    {
        // On failure the out value is printed as the error text.
        Console.WriteLine("Failed in visit {0}, error: {1}", link, content);
        return null;
    }

    WEBPAGE page = new WEBPAGE(link, content);

    if (!WebData.ContainsRecord(connection, link))
    {
        page.id = globalUniqueIndex++;
        WebData.SaveToLocal(page.id, content);

        // The content is read back before hashing; presumably so the stored
        // MD5 and length describe the text as it will later be loaded from
        // disk - TODO confirm.
        WebData.LoadFromLocal(page.id, out content);
        WebData.SaveToDB(connection, link, content.GetMD5(), content.Length, page.id);
    }

    pages.Add(link, page);

    foreach (string next_link in page.LINKS)
    {
        // String.Trim returns a new string; the original call discarded it, so
        // links were compared and queued untrimmed. Use the trimmed value.
        string trimmedLink = next_link.Trim('/');

        if (!url_visited.Contains(trimmedLink))
        {
            to_visit_links.Enqueue(trimmedLink);
            url_visited.Add(trimmedLink);
        }
    }

    return page;
}
/// <summary>
/// Rebuilds <c>url2page</c> from the "webs" table and the locally stored page
/// files, then fills each loaded page's <c>point_id</c> list with the indices
/// of the known urls it links to.
/// </summary>
/// <remarks>
/// Relies on <c>url2idx</c> already holding an entry for every url in the
/// table; the indexer lookup fails for a url that is missing from it.
/// A page whose file cannot be loaded, or whose content MD5 differs from the
/// value stored in the database, is reported on the console and left out.
/// </remarks>
/// <returns>The number of pages held in <c>url2page</c> afterwards.</returns>
public int ReadDBContent()
{
    // Dispose the reader once all rows are consumed; it used to be leaked.
    using (SQLiteDataReader reader = WebData.GetReaderByExecution(connection, "select url, md5 from webs"))
    {
        // Clear only after the query has been issued, so a failing query
        // leaves the previous contents untouched.
        url2page.Clear();

        while (reader.Read())
        {
            string url = reader.GetString(0);
            string md5 = reader.GetString(1);
            int idx = url2idx[url];

            if (!WebData.LoadFromLocal(idx, out string content))
            {
                Console.WriteLine("Failed to load file at index {0}", idx);
                continue;
            }

            // Hash once and reuse it for both the comparison and the message.
            string contentMd5 = content.GetMD5();
            if (contentMd5 == md5)
            {
                url2page.Add(url, new WEBPAGE(url, content));
            }
            else
            {
                Console.WriteLine("File at index {0} has different md5, md5 of content is {1}, but in db is {2}", idx, contentMd5, md5);
            }
        }
    }

    foreach (string src in url2idx.Keys)
    {
        // Urls whose page was skipped above have no url2page entry; indexing
        // them directly would throw, so they are passed over here.
        if (!url2page.TryGetValue(src, out var page))
        {
            continue;
        }

        foreach (string key in page.LINKS)
        {
            // Only links that point at an indexed url are recorded.
            if (url2idx.TryGetValue(key, out var targetIdx))
            {
                page.point_id.Add(targetIdx);
            }
        }
    }

    return url2page.Count;
}
/// <summary>
/// Creates a reader whose connection is obtained from
/// <c>WebData.ConnectDB</c> and opened immediately.
/// </summary>
public DBReader()
{
    connection = WebData.ConnectDB();
    connection.Open();
}