/// <summary>
/// Seeds the crawl queue (<c>TO_SEARCH</c>) with the hard-coded start pages,
/// each wrapped in a <c>QueryLink</c> created with a depth of 1.
/// </summary>
private void SetupStartingLinks()
{
    string[] seedUrls =
    {
        "http://www.cs.put.poznan.pl/rwalkowiak/",
        "http://www.anthonystruk.com/",
        "http://www.cs.put.poznan.pl/mkalewski/"
    };

    // Build every link first, then enqueue them, so the queue is only
    // touched once all links have been constructed.
    List<QueryLink> seedLinks = new List<QueryLink>(seedUrls.Length);
    foreach (string seedUrl in seedUrls)
    {
        seedLinks.Add(new QueryLink(seedUrl, 1));
    }

    foreach (QueryLink seedLink in seedLinks)
    {
        TO_SEARCH.Add(seedLink);
    }
}
/// <summary>
/// Empties the <c>websites</c> table and refills it by crawling the links queued in
/// <c>TO_SEARCH</c>, front to back, until the queue is empty.
/// </summary>
/// <remarks>
/// For each queued link the page is downloaded, reduced to text, and stored as a
/// (url, content) row. Links found on the page are appended to the queue with
/// <c>depth + 1</c> as long as the current link's depth is not greater than
/// <c>SEARCH_DEPTH</c>. A page that fails at any step is logged to the console and
/// skipped; the crawl continues with the next queued link.
/// Progress messages are written to <c>rtbServer</c> before and after the run.
/// </remarks>
private void IndexPages()
{
    this.rtbServer.AppendText("starting indexing routine" + Environment.NewLine);

    // using-blocks guarantee the connection and commands are released even if a statement throws.
    using (SQLiteConnection connection = new SQLiteConnection("Data Source=Words.sqlite;Version=3;"))
    {
        connection.Open();

        // Clear the table, then vacuum the database file.
        using (SQLiteCommand deleteCommand = new SQLiteCommand("delete from websites", connection))
        {
            deleteCommand.ExecuteNonQuery();
        }

        using (SQLiteCommand vacuumCommand = new SQLiteCommand("vacuum", connection))
        {
            vacuumCommand.ExecuteNonQuery();
        }

        // One client is enough: downloads are performed sequentially.
        using (WebClient client = new WebClient())
        {
            while (TO_SEARCH.Any())
            {
                // The link being processed stays at the head of the queue until the
                // finally-block below removes it; new links are appended behind it.
                string url = TO_SEARCH.First().GetUrl();
                Console.WriteLine(url);

                try
                {
                    // Original HTML of the page at the head of the queue.
                    string html = client.DownloadString(url);

                    // Plain text with all whitespace removed.
                    // NOTE(review): stripping every whitespace character also merges words
                    // together; kept as-is because stored content depends on it - confirm intent.
                    string text = GetPureTextFromHTML(html);
                    text = Regex.Replace(text, @"\s+", "");

                    // Single quotes are still stripped so stored content keeps the same
                    // shape as before; the parameterized insert below no longer needs it.
                    text = Regex.Replace(text, @"'", "");

                    // Remove remaining control characters (U+0000 - U+001F).
                    text = Regex.Replace(text, @"[\u0000-\u001F]", string.Empty);

                    List<string> rawLinks = GetLinks(html);

                    // Links after correction against the current page URL.
                    List<string> currPageLinks = CorrectLinks(rawLinks, url);

                    // Depth of the link currently being processed.
                    int depth = TO_SEARCH.First().GetDepth();
                    Console.WriteLine(TO_SEARCH.Count);

                    if (depth <= SEARCH_DEPTH)
                    {
                        foreach (string link in currPageLinks)
                        {
                            // Append at the end of the queue, one level deeper.
                            TO_SEARCH.Add(new QueryLink(link, depth + 1));
                        }
                    }

                    // Store the page. Values are bound as parameters because both the URL
                    // and the page text come from the network and must not be spliced into SQL.
                    using (SQLiteCommand insertCommand = new SQLiteCommand(
                        "insert or ignore into websites (url, content) values (@url, @content)",
                        connection))
                    {
                        insertCommand.Parameters.AddWithValue("@url", url);
                        insertCommand.Parameters.AddWithValue("@content", text);
                        insertCommand.ExecuteNonQuery();
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: a page that cannot be fetched, parsed or stored is skipped,
                    // but the reason is reported instead of being silently discarded.
                    Console.WriteLine("skipping " + url + ": " + ex.Message);
                }
                finally
                {
                    // Remove the processed link exactly once, on success and on failure alike,
                    // so no unprocessed link is dropped and an empty queue is never indexed.
                    TO_SEARCH.RemoveAt(0);
                }
            }
        }
    }

    this.rtbServer.AppendText("finished" + Environment.NewLine);
}