Example #1
0
        /// <summary>
        /// Adds a web page to the index inside a single database transaction.
        /// The decoded body text is registered as a <c>Document</c>, a plain-text
        /// copy is written to <c>textFilesDirectory</c>, and the words of the body
        /// and the title are counted and dumped to the database.
        /// </summary>
        /// <param name="url">URL of the page; its file name (without extension)
        /// is used to name the saved text copy.</param>
        /// <param name="title">Title of the page; indexed as title text.</param>
        /// <param name="body">HTML node whose inner text is indexed.</param>
        /// <remarks>
        /// On any failure the transaction is rolled back and the original
        /// exception is rethrown with its stack trace intact. The text file is
        /// not part of the database transaction, so a copy already written to
        /// disk is not removed by the rollback.
        /// </remarks>
        public void AddPage(string url, string title, HtmlNode body)
        {
            DbTransaction trn = null;

            try
            {
                trn = cnn.BeginTransaction();

                string originalPageText = HttpUtility.HtmlDecode(body.InnerText);
                string lowerPageText    = originalPageText.ToLower();
                int    count            = lowerPageText.Length;

                Document doc = new Document(url, title, count);
                doc.Insert(cnn);

                // Save a text copy of the HTML file. The using block guarantees the
                // file handle is released even if the write fails.
                string textFileName = Path.GetFileNameWithoutExtension(url) + ".txt";
                string textFilePath = textFilesDirectory + Path.DirectorySeparatorChar + textFileName;
                using (StreamWriter writer = new StreamWriter(textFilePath))
                {
                    writer.WriteLine(originalPageText);
                }

                WordCounter counter = new WordCounter(count);
                // Index the text of the main body:
                AddText(lowerPageText, count, counter, false);
                // Index the text of the title. The length is taken from the lowered
                // string so it always matches the text that is actually scanned.
                string lowerTitle = title.ToLower();
                AddText(lowerTitle, lowerTitle.Length, counter, true);

                counter.DumpToDatabase(cnn, doc);

                trn.Commit();
            }
            catch (Exception)
            {
                if (trn != null)
                {
                    trn.Rollback();
                }
                // Bare rethrow: "throw ex;" would reset the stack trace.
                throw;
            }
        }
Example #2
0
        /// <summary>
        /// Splits a text into words and registers each word in the counter.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <param name="count">Number of leading characters of <paramref name="text"/> to scan.</param>
        /// <param name="counter">Counter that receives every word found.</param>
        /// <param name="titleText">True when the text is the page title,
        /// false when it is the page body.</param>
        private void AddText( string text, int count, WordCounter counter , bool titleText )
        {
            bool insideWord = false;
            string word = "";
            int startIndex = 0;

            int pos = 0;
            while (pos < count)
            {
                char normalized = Normalize(text[pos]);
                bool isWordChar = GoodChar(normalized);

                if (isWordChar)
                {
                    // First accepted character after a gap opens a new word.
                    if (!insideWord)
                    {
                        word = "";
                        startIndex = pos;
                        insideWord = true;
                    }
                    word += normalized;
                }
                else if (insideWord)
                {
                    // A rejected character closes the word in progress.
                    insideWord = false;
                    counter.Add(word, startIndex, titleText);
                }

                pos++;
            }

            // Flush a word that runs up to the end of the scanned range.
            if (insideWord)
            {
                counter.Add(word, startIndex, titleText);
            }
        }
Example #3
0
        /// <summary>
        /// Adds a web page to the index inside a single database transaction.
        /// The body text is registered as a <c>Document</c>, a plain-text copy is
        /// written to <c>textFilesDirectory</c>, and the words of the body and the
        /// title are counted and dumped to the database.
        /// </summary>
        /// <param name="url">URL of the page; its file name (without extension)
        /// is used to name the saved text copy.</param>
        /// <param name="title">Title of the page; indexed as title text.</param>
        /// <param name="body">HTML element whose inner text is indexed.</param>
        /// <remarks>
        /// On any failure the transaction is rolled back and the original
        /// exception is rethrown with its stack trace intact. The text file is
        /// not part of the database transaction, so a copy already written to
        /// disk is not removed by the rollback.
        /// </remarks>
        public void AddPage(string url , string title , IHTMLElement body)
        {
            DbTransaction trn = null;

            try
            {
                trn = cnn.BeginTransaction();

                // Read the element text once so the indexed text and the saved
                // copy are guaranteed to come from the same value.
                string originalText = body.innerText;
                string pageText = originalText.ToLower();
                int count = pageText.Length;

                Document doc = new Document(url, title, count);
                doc.Insert(cnn);

                // Save a text copy of the HTML file. The using block guarantees the
                // file handle is released even if the write fails.
                string textFileName = Path.GetFileNameWithoutExtension( url ) + ".txt";
                string textFilePath = textFilesDirectory + Path.DirectorySeparatorChar + textFileName;
                using (StreamWriter writer = new StreamWriter(textFilePath))
                {
                    writer.WriteLine(originalText);
                }

                WordCounter counter = new WordCounter(count);
                // Index the text of the main body:
                AddText(pageText, count, counter , false );
                // Index the text of the title. The length is taken from the lowered
                // string so it always matches the text that is actually scanned.
                string lowerTitle = title.ToLower();
                AddText(lowerTitle, lowerTitle.Length, counter , true);

                counter.DumpToDatabase(cnn, doc);

                trn.Commit();
            }
            catch( Exception )
            {
                if (trn != null)
                {
                    trn.Rollback();
                }
                // Bare rethrow: "throw ex;" would reset the stack trace.
                throw;
            }
        }