示例#1
0
        /// <summary>
        /// Inserts each distinct non-empty word of <paramref name="words"/> into the
        /// words_rel_page table, tied to the page's p_id, inside a single transaction.
        /// On any failure the whole transaction is rolled back and the page is skipped
        /// (best-effort: the error is logged and the crawl continues).
        /// </summary>
        /// <param name="words">Tokenized words extracted from the page; empty tokens are ignored.</param>
        /// <param name="newPage">The page (url + database index) the words belong to.</param>
        private void IndexWords(string[] words, UrlAndIndex newPage)
        {
            MySqlTransaction sqlTran = myConnection.BeginTransaction ();

            // HashSet gives O(1) duplicate detection; List.Contains made this O(n^2).
            var uniqueWords = new HashSet<string> ();
            try {
                for (int i = 0; i < words.Length; i++) {
                    // Skip empty tokens and words already inserted for this page.
                    // (The original branched on newPage.Url.Contains(words[i]) but both
                    // branches were identical, so that dead conditional is removed.)
                    if (words [i] == "" || !uniqueWords.Add (words [i])) {
                        continue;
                    }
                    // Dispose the command deterministically; one was leaked per word before.
                    using (MySqlCommand setWord = myConnection.CreateCommand ()) {
                        setWord.CommandText =
                         "Insert into words_rel_page (word,p_id) values (@word,@p_id);";
                        setWord.Parameters.AddWithValue ("@word", words [i]);
                        setWord.Parameters.AddWithValue ("@p_id", newPage.Index);
                        setWord.ExecuteNonQuery ();
                    }
                }
                sqlTran.Commit ();
            } catch (Exception) {
                Console.Error.WriteLine ("Could not commit words from " + newPage.Url + " continuing");
                sqlTran.Rollback ();
            }
        }
示例#2
0
        /// <summary>
        /// Adds the new downloaded url to the database.
        /// </summary>
        /// <param name="i">To define the url index, and make easier the conversion to a matrix when calculating pagerank.</param>
        private void AddUrlToDb(UrlAndIndex newPage)
        {
            MySqlTransaction sqlTran = myConnection.BeginTransaction ();
            MySqlCommand createPage = myConnection.CreateCommand ();
            try {

                createPage.CommandText =
                 "Insert into pages (url,p_id,pagerank) values (@url,@ID,@pagerank);";
                createPage.Parameters.AddWithValue ("@url", newPage.Url);
                createPage.Parameters.AddWithValue ("@ID", newPage.Index);
                createPage.Parameters.AddWithValue ("@pagerank", 0);
                createPage.ExecuteNonQuery ();
                // Commit the transaction.
                sqlTran.Commit ();
                numberPages++;
                PagesInDatabase.Add (newPage);
                CrawlQueue.Enqueue (newPage);
            } catch (Exception e) {
                Console.Error.WriteLine ("Could not commit insert {0},skipping page ", newPage.Url);
                sqlTran.Rollback ();
                throw (e);
            }
        }
示例#3
0
        /// <summary>
        /// Finds the links among the webpage using regular expressions. Adds those not visited to the queue. In this version only root 
        /// links are evaluated.
        /// </summary>
        /// <param name="pageContents"> String containing the contents of the webpage</param>
        /// <summary>
        /// Finds the links in the webpage using regular expressions. Adds links not yet
        /// visited to the database and crawl queue, then records the linker/receiver
        /// relationship for pagerank. In this version only root links are evaluated.
        /// </summary>
        /// <param name="pageContents">String containing the contents of the webpage.</param>
        /// <param name="newPage">The page being crawled (the linker side of each relation).</param>
        private void FindNewPages(string pageContents, UrlAndIndex newPage)
        {
            MatchCollection links = Regex.Matches (pageContents, "href=\".*?\"", RegexOptions.Singleline);//find new links
            foreach (Match element in links) {
                // Keep only the root of the link (this version ignores deeper paths)
                // and normalize https to http so duplicates collapse onto one url.
                String linkString = Regex.Match (element.ToString (), "https?://www\\..*?\\.(com|es|org|edu|uk|de)", RegexOptions.Singleline).ToString ();
                linkString = linkString.Replace ("https://", "http://");
                UrlAndIndex link = new UrlAndIndex (linkString, numberPages);
                if (link.Url == "") {
                    continue;   // the regex matched no supported root url
                }
                UrlAndIndex linkInDb = PagesInDatabase.Find (page => page.Url.Equals (link.Url));
                if (linkInDb == null) {
                    // First time we see this url: persist it; on failure skip this link.
                    try {
                        AddUrlToDb (link);
                    } catch (Exception) {
                        continue;
                    }
                } else {
                    // Reuse the record already in db, or we would get a wrong p_id.
                    link = linkInDb;
                }

                // Record the link relationship (a page can link to itself; IGNORE
                // makes repeated relations a no-op at the database level).
                MySqlTransaction sqlTran = myConnection.BeginTransaction ();
                try {
                    using (MySqlCommand createLink = myConnection.CreateCommand ()) {
                        createLink.CommandText =
                         "Insert IGNORE into links (p_id_linker,p_id_reciever) values (@Linker,@Reciever);";
                        createLink.Parameters.AddWithValue ("@Linker", newPage.Index);
                        createLink.Parameters.AddWithValue ("@Reciever", link.Index);
                        createLink.ExecuteNonQuery ();
                    }
                    sqlTran.Commit ();
                } catch (Exception) {
                    Console.Error.WriteLine ("Could not commit link " + link + "from " + newPage + " continuing");
                    sqlTran.Rollback ();
                }
            }
        }
示例#4
0
 /// <summary>
 /// Stores the crawled page's tokenized words, extracted content and title in the
 /// crawled_pages table inside a transaction.
 /// </summary>
 /// <param name="newPage">The page whose database index (p_id) keys the row.</param>
 /// <param name="pageContents">The tokenized word text stored in the words column.</param>
 /// <param name="content">The extracted page content.</param>
 /// <param name="title">The page title.</param>
 /// <remarks>
 /// A failed insert is deliberately treated as fatal for now: it is logged, the
 /// transaction is rolled back, and the process exits with code 1.
 /// </remarks>
 void AddCrawledPage(UrlAndIndex newPage, string pageContents, string content,string title)
 {
     MySqlTransaction sqlTran = myConnection.BeginTransaction ();
     try {
         using (MySqlCommand createCrawledPage = myConnection.CreateCommand ()) {
             createCrawledPage.CommandText = "Insert into crawled_pages (p_id,words,content,title) values (@ID,@words,@content,@title);";
             createCrawledPage.Parameters.AddWithValue ("@ID", newPage.Index);
             createCrawledPage.Parameters.AddWithValue ("@words", pageContents);
             createCrawledPage.Parameters.AddWithValue ("@title", title);
             createCrawledPage.Parameters.AddWithValue ("@content", content);
             createCrawledPage.ExecuteNonQuery ();
         }
         sqlTran.Commit ();
     }
     catch (Exception) {
         Console.Error.WriteLine ("Could not commit insert {0} into crawled, fatal error for now ", newPage.Url);
         sqlTran.Rollback ();
         System.Environment.Exit(1);
     }
 }