/// <summary>
/// Adds a web page to the index: stores a document record, saves a plain-text
/// copy of the page body to disk and records the words of the body and title.
/// </summary>
/// <remarks>
/// All database work runs inside a single transaction on <c>cnn</c>. If any step
/// fails the transaction is rolled back and the exception is rethrown. The text
/// file is written outside the transaction, so a copy that was already written
/// is not removed by the rollback.
/// </remarks>
/// <param name="url">URL of the page. Its file name without extension is used as
/// the name of the saved ".txt" copy.</param>
/// <param name="title">Title of the page; it is indexed as title text.</param>
/// <param name="body">HTML node whose inner text is the body to index.</param>
public void AddPage(string url, string title, HtmlNode body)
{
    DbTransaction trn = null;
    try
    {
        trn = cnn.BeginTransaction();

        // Keep the decoded text in its original casing for the saved copy;
        // the lower-cased version is the one that gets indexed.
        string originalPageText = HttpUtility.HtmlDecode(body.InnerText);
        string lowerPageText = originalPageText.ToLower();
        int count = lowerPageText.Length;

        Document doc = new Document(url, title, count);
        doc.Insert(cnn);

        // Save a text copy of the HTML file. The using statement releases the
        // file handle even when the write throws.
        string textFileName = Path.GetFileNameWithoutExtension(url) + ".txt";
        using (StreamWriter writer = new StreamWriter(textFilesDirectory + Path.DirectorySeparatorChar + textFileName))
        {
            writer.WriteLine(originalPageText);
        }

        WordCounter counter = new WordCounter(count);
        // Get the text of the main body:
        AddText(lowerPageText, count, counter, false);
        // Add the text of the title:
        AddText(title.ToLower(), title.Length, counter, true);

        counter.DumpToDatabase(cnn, doc);
        trn.Commit();
    }
    catch (Exception)
    {
        if (trn != null)
        {
            trn.Rollback();
        }
        // Rethrow with "throw;" so the original stack trace is preserved.
        throw;
    }
}
/// <summary>
/// Splits a text into words and registers each word in the given counter.
/// </summary>
/// <remarks>
/// Every character is passed through <c>Normalize</c> first. A word is a maximal
/// run of consecutive characters accepted by <c>GoodChar</c>; it is built from the
/// normalized characters and reported together with the index of its first
/// character in <paramref name="text"/>.
/// </remarks>
/// <param name="text">Text of the web page to index.</param>
/// <param name="count">Number of characters of <paramref name="text"/> to scan,
/// starting at index 0.</param>
/// <param name="counter">Counter that receives every word found.</param>
/// <param name="titleText">True if the text is the title section of the web page;
/// false if it is the web page body.</param>
private void AddText(string text, int count, WordCounter counter, bool titleText)
{
    string word = "";
    int start = 0;
    bool insideWord = false;

    for (int pos = 0; pos < count; pos++)
    {
        char normalized = Normalize(text[pos]);

        if (!GoodChar(normalized))
        {
            // A separator ends the word in progress, if there is one.
            if (insideWord)
            {
                insideWord = false;
                counter.Add(word, start, titleText);
            }
            continue;
        }

        if (!insideWord)
        {
            // First character of a new word: remember where it begins.
            word = "";
            insideWord = true;
            start = pos;
        }

        word += normalized;
    }

    // The text may end in the middle of a word; flush it.
    if (insideWord)
    {
        counter.Add(word, start, titleText);
    }
}
/// <summary>
/// Adds a web page to the index: stores a document record, saves a plain-text
/// copy of the page body to disk and records the words of the body and title.
/// </summary>
/// <remarks>
/// All database work runs inside a single transaction on <c>cnn</c>. If any step
/// fails the transaction is rolled back and the exception is rethrown. The text
/// file is written outside the transaction, so a copy that was already written
/// is not removed by the rollback.
/// </remarks>
/// <param name="url">URL of the page. Its file name without extension is used as
/// the name of the saved ".txt" copy.</param>
/// <param name="title">Title of the page; it is indexed as title text.</param>
/// <param name="body">HTML element whose inner text is the body to index.</param>
public void AddPage(string url, string title, IHTMLElement body)
{
    DbTransaction trn = null;
    try
    {
        trn = cnn.BeginTransaction();

        // Read the element text once so the saved copy and the indexed text
        // come from the same snapshot.
        string originalPageText = body.innerText;
        string pageText = originalPageText.ToLower();
        int count = pageText.Length;

        Document doc = new Document(url, title, count);
        doc.Insert(cnn);

        // Save a text copy of the HTML file. The using statement releases the
        // file handle even when the write throws.
        string textFileName = Path.GetFileNameWithoutExtension(url) + ".txt";
        using (StreamWriter writer = new StreamWriter(textFilesDirectory + Path.DirectorySeparatorChar + textFileName))
        {
            writer.WriteLine(originalPageText);
        }

        WordCounter counter = new WordCounter(count);
        // Get the text of the main body:
        AddText(pageText, count, counter, false);
        // Add the text of the title:
        AddText(title.ToLower(), title.Length, counter, true);

        counter.DumpToDatabase(cnn, doc);
        trn.Commit();
    }
    catch (Exception)
    {
        if (trn != null)
        {
            trn.Rollback();
        }
        // Rethrow with "throw;" so the original stack trace is preserved.
        throw;
    }
}