Exemplo n.º 1
0
 /// <summary>
 /// Creates a spider: loads the black list from <c>BlackList.txt</c> under
 /// <c>DataFilePath</c>, opens the database connection, creates the "webs" table
 /// via <c>WebData.CreateWebTable</c>, and seeds <c>globalUniqueIndex</c>.
 /// </summary>
 public WebSpider()
 {
     // Path.Combine supplies the platform's directory separator instead of a
     // hard-coded backslash, and avoids a doubled separator when DataFilePath
     // already ends with one.
     LoadBlackList(System.IO.Path.Combine(DataFilePath, "BlackList.txt"));
     WebData.CreateDBFile();
     connection = WebData.ConnectDB();
     connection.Open();
     WebData.CreateWebTable(connection, "webs");
     if (Count > 0)
     {
         // Continue numbering right after the highest index already stored in "webs".
         globalUniqueIndex = WebData.GetIntByExecution(connection, "select max(idx) from webs") + 1;
     }
 }
Exemplo n.º 2
0
        /// <summary>
        /// Rebuilds the in-memory <c>url2idx</c> and <c>idx2url</c> maps from the
        /// <c>url</c> and <c>idx</c> columns of the "webs" table.
        /// </summary>
        public void ReadDBIndex()
        {
            url2idx.Clear();
            idx2url.Clear();

            // The reader is disposable; the using statement releases it even if a
            // row read or a dictionary insertion throws.
            using (SQLiteDataReader reader = WebData.GetReaderByExecution(connection, "select url, idx from webs"))
            {
                while (reader.Read())
                {
                    string url = reader.GetString(0);
                    int idx = reader.GetInt32(1);

                    url2idx.Add(url, idx);
                    idx2url.Add(idx, url);
                }
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Takes the next link from <c>to_visit_links</c>, downloads it, records it,
        /// and enqueues the links it contains that have not been seen yet.
        /// </summary>
        /// <returns>
        /// The downloaded page, or <c>null</c> when the queue is empty, the link is
        /// rejected by <c>isLegal</c>, or <c>DownloadHtml</c> fails.
        /// </returns>
        public WEBPAGE SingleStep()
        {
            if (to_visit_links.Count <= 0)
            {
                return null;
            }

            string link = to_visit_links.Dequeue();

            Console.WriteLine("{0} visited.", link);
            string content = "";

            if (!isLegal(link))
            {
                Console.WriteLine("Failed in visit {0}, because it is illegal", link);
                return null;
            }

            if (!DownloadHtml(link, out content))
            {
                // On failure the out value is reported as the error text.
                Console.WriteLine("Failed in visit {0}, error: {1}", link, content);
                return null;
            }

            WEBPAGE page = new WEBPAGE(link, content);

            if (!WebData.ContainsRecord(connection, link))
            {
                page.id = globalUniqueIndex++;
                WebData.SaveToLocal(page.id, content);
                // The content is read back before hashing, so the MD5 and length stored
                // in the database describe what LoadFromLocal returns for this id.
                WebData.LoadFromLocal(page.id, out content);
                WebData.SaveToDB(connection, link, content.GetMD5(), content.Length, page.id);
            }

            pages.Add(link, page);
            foreach (string next_link in page.LINKS)
            {
                // Strings are immutable: Trim returns a new string, so the result must
                // be captured. The original discarded it and enqueued untrimmed links.
                string trimmed_link = next_link.Trim('/');
                if (!url_visited.Contains(trimmed_link))
                {
                    to_visit_links.Enqueue(trimmed_link);
                    url_visited.Add(trimmed_link);
                }
            }
            return page;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Rebuilds <c>url2page</c> from the "webs" table: each row's content is loaded
        /// with <c>WebData.LoadFromLocal</c> and kept only if its MD5 matches the stored
        /// one. Then fills each loaded page's <c>point_id</c> with the indices of the
        /// links it contains that are present in <c>url2idx</c>.
        /// </summary>
        /// <returns>The number of pages held in <c>url2page</c>.</returns>
        public int ReadDBContent()
        {
            url2page.Clear();

            // Dispose the reader deterministically, even if a row fails to process.
            using (SQLiteDataReader reader = WebData.GetReaderByExecution(connection, "select url, md5 from webs"))
            {
                while (reader.Read())
                {
                    string url = reader.GetString(0);
                    string md5 = reader.GetString(1);
                    int idx = url2idx[url];

                    if (!WebData.LoadFromLocal(idx, out string content))
                    {
                        Console.WriteLine("Failed to load file at index {0}", idx);
                        continue;
                    }

                    string contentMd5 = content.GetMD5();
                    if (contentMd5 == md5)
                    {
                        url2page.Add(url, new WEBPAGE(url, content));
                    }
                    else
                    {
                        Console.WriteLine("File at index {0} has different md5, md5 of content is {1}, but in db is {2}", idx, contentMd5, md5);
                    }
                }
            }

            foreach (string src in url2idx.Keys)
            {
                // URLs whose content failed to load or failed the MD5 check were never
                // added to url2page; indexing them directly would throw.
                if (!url2page.TryGetValue(src, out var page))
                {
                    continue;
                }

                foreach (string key in page.LINKS)
                {
                    if (url2idx.TryGetValue(key, out int targetIdx))
                    {
                        page.point_id.Add(targetIdx);
                    }
                }
            }
            return url2page.Count;
        }
Exemplo n.º 5
0
 /// <summary>
 /// Creates a reader: obtains a connection from <c>WebData.ConnectDB</c>, stores it
 /// in <c>connection</c>, and opens it.
 /// </summary>
 public DBReader()
 {
     connection = WebData.ConnectDB();
     // NOTE(review): nothing in this constructor closes the connection — confirm it is
     // closed or disposed elsewhere in the class.
     connection.Open();
 }