Example #1
0
        /// <summary>
        /// Adds the fixed set of start pages to <c>TO_SEARCH</c>, each wrapped in a
        /// <c>QueryLink</c> whose second constructor argument is 1.
        /// </summary>
        private void SetupStartingLinks()
        {
            string[] seedUrls =
            {
                "http://www.cs.put.poznan.pl/rwalkowiak/",
                "http://www.anthonystruk.com/",
                "http://www.cs.put.poznan.pl/mkalewski/"
            };

            // Build every link first so nothing is queued unless all of them
            // could be constructed.
            List<QueryLink> seedLinks = new List<QueryLink>();
            foreach (string seedUrl in seedUrls)
            {
                seedLinks.Add(new QueryLink(seedUrl, 1));
            }

            // Queue the seeds in declaration order.
            foreach (QueryLink seedLink in seedLinks)
            {
                TO_SEARCH.Add(seedLink);
            }
        }
Example #2
0
        /// <summary>
        /// Empties the <c>websites</c> table, then processes every entry of
        /// <c>TO_SEARCH</c> in order: downloads the page, normalizes its text and
        /// stores the URL together with that text.
        /// </summary>
        /// <remarks>
        /// Links found on a page are appended to <c>TO_SEARCH</c> as long as the page's
        /// depth does not exceed <c>SEARCH_DEPTH</c>. A page that cannot be downloaded,
        /// parsed or stored is reported on the console and skipped; the loop continues
        /// with the next entry. Progress messages are written to <c>rtbServer</c>.
        /// </remarks>
        private void IndexPages()
        {
            this.rtbServer.AppendText("starting indexing routine" + Environment.NewLine);

            // Both the connection and the web client are disposed even when an
            // unexpected exception escapes the method.
            using (SQLiteConnection connection = new SQLiteConnection("Data Source=Words.sqlite;Version=3;"))
            using (WebClient client = new WebClient())
            {
                connection.Open();

                // Clear the table, then vacuum the database file.
                using (SQLiteCommand deleteCommand = new SQLiteCommand("delete from websites", connection))
                {
                    deleteCommand.ExecuteNonQuery();
                }
                using (SQLiteCommand vacuumCommand = new SQLiteCommand("vacuum", connection))
                {
                    vacuumCommand.ExecuteNonQuery();
                }

                while (TO_SEARCH.Any())
                {
                    // The entry being processed stays at index 0 until the finally
                    // block below removes it; new links are only appended at the end.
                    var current = TO_SEARCH.First();
                    string url = current.GetUrl();
                    Console.WriteLine(url);

                    try
                    {
                        string html = client.DownloadString(url);

                        // Strip all whitespace, apostrophes and control characters
                        // (U+0000-U+001F) from the extracted text.
                        // NOTE(review): apostrophes were presumably stripped to keep the
                        // old concatenated SQL valid; kept so stored content is unchanged.
                        string text = GetPureTextFromHTML(html);
                        text = Regex.Replace(text, @"\s+", "");
                        text = text.Replace("'", "");
                        text = Regex.Replace(text, @"[\u0000-\u001F]", string.Empty);

                        List<string> rawLinks = GetLinks(html);
                        List<string> currPageLinks = CorrectLinks(rawLinks, url);

                        int depth = current.GetDepth();
                        Console.WriteLine(TO_SEARCH.Count);
                        if (depth <= SEARCH_DEPTH)
                        {
                            foreach (string link in currPageLinks)
                            {
                                TO_SEARCH.Add(new QueryLink(link, depth + 1)); // append at the end
                            }
                        }

                        // Parameterized so quotes in the URL or text can neither break
                        // the statement nor inject SQL.
                        using (SQLiteCommand insertCommand = new SQLiteCommand(
                            "insert or ignore into websites (url, content) values (@url, @content)",
                            connection))
                        {
                            insertCommand.Parameters.AddWithValue("@url", url);
                            insertCommand.Parameters.AddWithValue("@content", text);
                            insertCommand.ExecuteNonQuery();
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best-effort crawl: report the failure and move on.
                        Console.WriteLine("skipping " + url + ": " + ex.Message);
                    }
                    finally
                    {
                        // Remove the processed entry exactly once, whatever happened above.
                        TO_SEARCH.RemoveAt(0);
                    }
                }
            }

            this.rtbServer.AppendText("finished" + Environment.NewLine);
        }