/// <summary>
/// Inserts each distinct non-empty word of the page into the words_rel_page
/// table, associated with the page's id, inside one transaction. A failure
/// rolls back and the page is skipped (best effort, the crawl continues).
/// </summary>
/// <param name="words">Tokenized words extracted from the page.</param>
/// <param name="newPage">Page whose id the words are linked to.</param>
private void IndexWords(string[] words, UrlAndIndex newPage) {
	MySqlTransaction sqlTran = myConnection.BeginTransaction ();
	// HashSet gives O(1) duplicate detection; the original List.Contains
	// made the loop O(n^2) over the page's word count.
	var uniqueWords = new HashSet<string> ();
	try {
		foreach (string word in words) {
			// Skip empty tokens; Add returns false for words already inserted
			// for this page, so each word is stored at most once.
			if (word == "" || !uniqueWords.Add (word))
				continue;
			// NOTE(review): the original branched on newPage.Url.Contains(word),
			// but both branches were identical, so the check was dead and is dropped.
			using (MySqlCommand setWord = myConnection.CreateCommand ()) {
				setWord.CommandText =
					"Insert into words_rel_page (word,p_id) values (@word,@p_id);";
				setWord.Parameters.AddWithValue ("@word", word);
				setWord.Parameters.AddWithValue ("@p_id", newPage.Index);
				setWord.ExecuteNonQuery ();
			}
		}
		sqlTran.Commit ();
	} catch (Exception e) {
		// Log the reason (the original swallowed it) and keep crawling.
		Console.Error.WriteLine ("Could not commit words from " + newPage.Url + " continuing: " + e.Message);
		sqlTran.Rollback ();
	}
}
/// <summary>
/// Adds the new downloaded url to the pages table and, on success, registers it
/// in the in-memory page list and the crawl queue.
/// </summary>
/// <param name="newPage">Holds the url and its numeric index; the index makes
/// the conversion to a matrix easier when calculating pagerank.</param>
/// <exception cref="Exception">Rethrown after rollback so the caller can skip the page.</exception>
private void AddUrlToDb(UrlAndIndex newPage) {
	MySqlTransaction sqlTran = myConnection.BeginTransaction ();
	using (MySqlCommand createPage = myConnection.CreateCommand ()) {
		try {
			createPage.CommandText =
				"Insert into pages (url,p_id,pagerank) values (@url,@ID,@pagerank);";
			createPage.Parameters.AddWithValue ("@url", newPage.Url);
			createPage.Parameters.AddWithValue ("@ID", newPage.Index);
			createPage.Parameters.AddWithValue ("@pagerank", 0);
			createPage.ExecuteNonQuery ();
			// Commit the transaction.
			sqlTran.Commit ();
			// Only touch in-memory state once the insert is durable.
			numberPages++;
			PagesInDatabase.Add (newPage);
			CrawlQueue.Enqueue (newPage);
		} catch (Exception) {
			Console.Error.WriteLine ("Could not commit insert {0},skipping page ", newPage.Url);
			sqlTran.Rollback ();
			// "throw;" preserves the original stack trace; the original
			// "throw (e)" reset it to this frame.
			throw;
		}
	}
}
/// <summary>
/// Finds the links among the webpage using regular expressions. Adds those not
/// visited to the queue and records the linker->receiver relationship. In this
/// version only root links (http://www.&lt;domain&gt;.&lt;tld&gt;) are evaluated.
/// </summary>
/// <param name="pageContents">String containing the contents of the webpage.</param>
/// <param name="newPage">The page whose outgoing links are being extracted.</param>
private void FindNewPages(string pageContents, UrlAndIndex newPage) {
	// Find every href attribute in the page.
	MatchCollection links = Regex.Matches (pageContents, "href=\".*?\"", RegexOptions.Singleline);
	foreach (Match element in links) {
		// Extract just the scheme + www.<domain>.<tld> root of each link.
		String linkString = Regex.Match (element.ToString (),
			"https?://www\\..*?\\.(com|es|org|edu|uk|de)",
			RegexOptions.Singleline).ToString ();
		// Normalize to http so the same site is never stored twice.
		linkString = linkString.Replace ("https://", "http://");
		UrlAndIndex link = new UrlAndIndex (linkString, numberPages);
		if (link.Url == "")
			continue; // href did not match the root pattern
		UrlAndIndex linkInDb = PagesInDatabase.Find (page => page.Url.Equals (link.Url));
		if (linkInDb == null) {
			// First sighting: persist it (AddUrlToDb also enqueues it).
			try {
				AddUrlToDb (link);
			} catch (Exception) {
				continue; // insert failed and was already logged; skip this link
			}
		} else {
			// We use the link already in db, or we would get a wrong p_id.
			link = linkInDb;
		}
		// Add link relationship to db (a page can link to itself).
		MySqlTransaction sqlTran = myConnection.BeginTransaction ();
		using (MySqlCommand createLink = myConnection.CreateCommand ()) {
			try {
				createLink.CommandText =
					"Insert IGNORE into links (p_id_linker,p_id_reciever) values (@Linker,@Reciever);";
				createLink.Parameters.AddWithValue ("@Linker", newPage.Index);
				createLink.Parameters.AddWithValue ("@Reciever", link.Index);
				createLink.ExecuteNonQuery ();
				// Commit the transaction.
				sqlTran.Commit ();
			} catch (Exception e) {
				// Log the urls — the original concatenated the objects (printing
				// their type names) and was missing a space before "from".
				Console.Error.WriteLine ("Could not commit link " + link.Url + " from " + newPage.Url + " continuing: " + e.Message);
				sqlTran.Rollback ();
			}
		}
	}
}
/// <summary>
/// Stores the crawled page (word list, cleaned content and title) in the
/// crawled_pages table inside a transaction. A failure here is treated as
/// fatal and terminates the process.
/// </summary>
/// <param name="newPage">Page being stored; its Index is used as p_id.</param>
/// <param name="pageContents">Raw word list extracted from the page.</param>
/// <param name="content">Cleaned page content.</param>
/// <param name="title">Page title.</param>
void AddCrawledPage(UrlAndIndex newPage, string pageContents, string content, string title) {
	MySqlTransaction sqlTran = myConnection.BeginTransaction ();
	using (MySqlCommand createCrawledPage = myConnection.CreateCommand ()) {
		try {
			createCrawledPage.CommandText =
				"Insert into crawled_pages (p_id,words,content,title) values (@ID,@words,@content,@title);";
			createCrawledPage.Parameters.AddWithValue ("@ID", newPage.Index);
			createCrawledPage.Parameters.AddWithValue ("@words", pageContents);
			createCrawledPage.Parameters.AddWithValue ("@title", title);
			createCrawledPage.Parameters.AddWithValue ("@content", content);
			createCrawledPage.ExecuteNonQuery ();
			// Commit the transaction.
			sqlTran.Commit ();
		} catch (Exception e) {
			Console.Error.WriteLine ("Could not commit insert {0} into crawled, fatal error for now ", newPage.Url);
			// Log the actual failure before exiting — the original exited
			// without ever reporting why the insert failed.
			Console.Error.WriteLine (e);
			sqlTran.Rollback ();
			System.Environment.Exit (1);
		}
	}
}