示例#1
0
        /// <summary>
        /// Inserts <paramref name="newIndexedPage"/>, or when a row with the same Url already exists,
        /// overwrites that row's ViewCount, Content and Title with the new values.
        /// </summary>
        /// <remarks>
        /// Content is truncated in place to at most 200 characters before saving, so the object
        /// passed in by the caller is modified. A null Content is saved as-is instead of throwing.
        /// </remarks>
        /// <param name="newIndexedPage">The freshly crawled page to persist.</param>
        private static void SavePageIndex(IndexedPage newIndexedPage)
        {
            const int maxContentLength = 200;

            using (var context = new DatabaseContext())
            {
                // Guard against a null Content: only truncate when there is something to cut.
                if (newIndexedPage.Content != null && newIndexedPage.Content.Length > maxContentLength)
                {
                    newIndexedPage.Content = newIndexedPage.Content.Substring(0, maxContentLength);
                }

                // One lookup instead of Any() followed by First(): a single query, and no gap
                // between the existence check and the fetch in which the row could disappear.
                var existingPage = context.IndexedPages.FirstOrDefault(p => p.Url == newIndexedPage.Url);

                if (existingPage == null)
                {
                    context.IndexedPages.Add(newIndexedPage);
                }
                else
                {
                    existingPage.ViewCount = newIndexedPage.ViewCount;
                    existingPage.Content   = newIndexedPage.Content;
                    existingPage.Title     = newIndexedPage.Title;

                    // Kept from the original: marks the whole entity as modified before saving.
                    context.Entry(existingPage).State = EntityState.Modified;
                }

                context.SaveChanges();
            }
        }
示例#2
0
        /// <summary>
        /// Builds an IndexedPage from a crawled Nairaland document (URL, title, view count, content,
        /// author) and passes it to SavePageIndex.
        /// </summary>
        /// <remarks>
        /// Pages whose absolute URL contains "register" or "programming-ads" are skipped.
        /// ViewCount is only set when the extracted token passes IsInt().
        /// </remarks>
        /// <param name="document">Parsed HTML of the crawled page.</param>
        /// <param name="page">The crawl entry that supplies the page URL.</param>
        protected void IndexNairalandDocument(CQ document, PageToCrawl page)
        {
            var isRegistrationPage = page.PageUrl.AbsoluteUri.Contains("register");
            if (isRegistrationPage || page.PageUrl.AbsoluteUri.Contains("programming-ads"))
            {
                return;
            }

            var indexed = new IndexedPage
            {
                Url   = page.PageUrl.AbsoluteUri,
                Title = document.Select(".body>h2").Text()
            };

            // The view count candidate is whatever follows the last '(' in the bold element's text.
            var boldSegments = document.Select(".body>.bold").Text().Split('(');
            var afterLastParen = boldSegments.LastOrDefault();
            if (afterLastParen == null)
            {
                return;
            }

            // Only the first space-delimited token is considered.
            var countToken = afterLastParen.Split(' ')[0];
            if (countToken.IsInt())
            {
                indexed.ViewCount = int.Parse(countToken);
            }

            indexed.Site    = SiteNames.NAIRALAND;
            indexed.Content = document.Select(".body table div.narrow").Html();
            indexed.Author  = document.Select(".body table a.user").First().Text();

            // Normalize the title by stripping the site suffix when present.
            const string titleSuffix = " - Programming - Nairaland";
            if (indexed.Title.EndsWith(titleSuffix))
            {
                var keptLength = indexed.Title.Length - titleSuffix.Length;
                indexed.Title = indexed.Title.Substring(0, keptLength);
            }

            SavePageIndex(indexed);
        }
        /// <summary>
        /// Main save method: stores the page itself, then calls SaveTheLinks, SaveTheKeywords and
        /// UpdateIsIndexedFlag for it.
        /// </summary>
        /// <remarks>
        /// The title is limited to 50 characters on both the insert and the update path. The
        /// original only truncated on insert, and cut long titles to 49 characters while letting
        /// 50-character titles through unchanged.
        /// Any exception is logged through MessageLogger and reported as a return value of 0.
        /// </remarks>
        /// <param name="searchResults">Crawl result holding the page data, links and keyword rankings.</param>
        /// <returns>The PageID of the inserted or updated page, or 0 when saving failed.</returns>
        public static int SaveSearchResults(ContentSearchResult searchResults)
        {
            const int maxTitleLength = 50;

            try
            {
                int pageIDAfterInsert = 0;

                // Compute the bounded title once so both branches below store the same value.
                string title = searchResults.Title.Length > maxTitleLength
                    ? searchResults.Title.Substring(0, maxTitleLength)
                    : searchResults.Title;

                //save the  page
                IndexedPage pg = new IndexedPage();

                pg.DateCreated     = DateTime.Now;
                pg.ParentID        = searchResults.ParentID;
                pg.PageName        = GetFileWithFolder(searchResults.PageURL);
                pg.PageURL         = searchResults.PageURL;
                pg.ParentDirectory = searchResults.ParentDirectory;
                pg.IndexedSiteID   = searchResults.IndexedSiteID;
                pg.Title           = title;

                if (!IsPageAlreadySaved(pg.PageURL, pg.PageName))
                {
                    DB.IndexedPages.Add(pg);
                    DB.SaveChanges();
                    pageIDAfterInsert = pg.PageID;
                }
                else
                {
                    //the page already exists so refresh a few fields on the stored row.
                    pg             = GetPageByName(pg.PageURL, pg.PageName);
                    pg.DateCreated = DateTime.Now;
                    pg.Title       = title;

                    pageIDAfterInsert = pg.PageID;
                    DB.SaveChanges();
                }

                SaveTheLinks(searchResults, pg);    //save the links for this page.

                SaveTheKeywords(searchResults, pg); //save the keywords

                UpdateIsIndexedFlag(pg.PageID);     //update the IsIndexed flag so it is not run again.

                return pageIDAfterInsert;
            }
            catch (DbEntityValidationException ex)
            {
                // Kept as a separate handler so the same MessageLogger overload is chosen as before.
                MessageLogger.LogThis(ex);
                return 0;
            }
            catch (Exception ex)
            {
                MessageLogger.LogThis(ex);
                return 0;
            }
        }
        /// <summary>
        /// Saves the links pulled from the HTML of a page as new IndexedPage rows whose ParentID is
        /// the PageID of <paramref name="pg"/>.
        /// </summary>
        /// <remarks>
        /// Only the first 10 links are processed. The surplus entries are removed from
        /// searchResults.Links itself, so the caller's list is shortened.
        /// A link is skipped when it is at most one character long, fails IsValidLink, is already
        /// stored according to IsPageAlreadySaved, or repeats a (PageURL, PageName) pair queued
        /// earlier in the same call.
        /// Exceptions are logged together with a serialized copy of searchResults and not rethrown.
        /// </remarks>
        /// <param name="searchResults">Crawl result whose Links are to be saved.</param>
        /// <param name="pg">The parent page the links were found on.</param>
        public static void SaveTheLinks(ContentSearchResult searchResults, IndexedPage pg)
        {
            const int maxLinksPerPage = 10;

            List<IndexedPage> linkPages = new List<IndexedPage>();

            // Pairs already queued in this call. The database check alone cannot see them because
            // nothing is written until after the loop, so repeated links would be inserted twice.
            HashSet<Tuple<string, string>> queuedPages = new HashSet<Tuple<string, string>>();

            try
            {
                if (searchResults.Links.Count > maxLinksPerPage)
                {
                    //for speed, remove the links so we can see if the rest of the system works .
                    searchResults.Links.RemoveRange(maxLinksPerPage, searchResults.Links.Count - maxLinksPerPage);
                }

                foreach (string singleLink in searchResults.Links)
                {
                    if (singleLink.Length <= 1)
                    {
                        continue; //it might be only a /
                    }

                    IndexedPage cp = new IndexedPage();
                    cp.DateCreated   = DateTime.Now;
                    cp.ParentID      = pg.PageID;
                    cp.PageName      = GetFileWithFolder(singleLink);
                    cp.IndexedSiteID = pg.IndexedSiteID;
                    //get directory for the file, not only the filename.
                    cp.ParentDirectory = Services.SearchLibrary.GetDirectoryForFile(singleLink, pg.PageID);
                    cp.PageURL         = GetFullURLFromPartial(cp.PageName, cp.ParentDirectory);
                    cp.Title           = ""; // THIS COMES ONLY FROM THE CONTENT;

                    if (!IsValidLink(cp.PageURL))
                    {
                        continue;
                    }

                    // Add() returns false for a pair seen earlier in this batch; skip it before
                    // spending a database query on it.
                    if (!queuedPages.Add(Tuple.Create(cp.PageURL, cp.PageName)))
                    {
                        continue;
                    }

                    if (!DBSearchResult.IsPageAlreadySaved(cp.PageURL, cp.PageName))
                    {
                        linkPages.Add(cp);
                    }
                }

                DB.IndexedPages.AddRange(linkPages);
                DB.SaveChanges();
            }
            catch (DbEntityValidationException ex)
            {
                // Kept as a separate handler so the same MessageLogger overload is chosen as before.
                string data = Services.SerializeIt.SerializeThis(searchResults);
                MessageLogger.LogThis(ex, data);
            }
            catch (Exception ex)
            {
                string data = Services.SerializeIt.SerializeThis(searchResults);
                MessageLogger.LogThis(ex, data);
            }
        }
示例#5
0
        /// <summary>
        /// Builds an IndexedPage from a crawled StackOverflow question document (URL, title, view
        /// count, content, author) and passes it to SavePageIndex.
        /// </summary>
        /// <remarks>
        /// Indexing is best-effort: any exception raised while extracting or saving is traced via
        /// System.Diagnostics.Trace and not rethrown, so the page is simply skipped.
        /// ViewCount is only set when the extracted token passes IsInt().
        /// </remarks>
        /// <param name="document">Parsed HTML of the crawled page.</param>
        /// <param name="page">The crawl entry that supplies the page URL.</param>
        protected void IndexStackOverflowDocument(CQ document, PageToCrawl page)
        {
            try
            {
                var newIndexedPage = new IndexedPage();
                newIndexedPage.Url   = page.PageUrl.AbsoluteUri;
                newIndexedPage.Title = document.Select("#question-header>h1").Text();

                // Second element matched by "#qinfo b"; First() throws when there are fewer than
                // two matches, which lands in the catch below and skips the page.
                var viewText = document.Select("#qinfo b").Skip(1).First().InnerText;

                // Only the first space-delimited token is considered.
                viewText = viewText.Split(' ')[0];
                if (viewText.IsInt())
                {
                    newIndexedPage.ViewCount = int.Parse(viewText);
                }

                newIndexedPage.Site    = SiteNames.STACKOVERFLOW;
                newIndexedPage.Content = document.Select(".postcell .post-text").Text();
                newIndexedPage.Author  = document.Select(".post-signature .user-info .user-details a").First().Text();

                SavePageIndex(newIndexedPage);
            }
            catch (Exception exception)
            {
                // Record the failure for health monitoring instead of dropping it silently.
                // The URL object is passed as-is (not via AbsoluteUri) so that formatting the
                // message cannot itself throw inside the handler.
                object failedUrl = page == null ? null : (object)page.PageUrl;
                System.Diagnostics.Trace.TraceError(
                    "IndexStackOverflowDocument failed for '{0}': {1}", failedUrl, exception);
            }
        }
        /// <summary>
        /// Stores one PageKeyWord row per entry of searchResults.KeyWordRankingList, each tied to
        /// the PageID of <paramref name="pg"/> and carrying the keyword and its rank as the count.
        /// </summary>
        /// <remarks>
        /// Exceptions are logged together with a serialized copy of the keyword list and not rethrown.
        /// </remarks>
        /// <param name="searchResults">Crawl result that supplies the keyword rankings.</param>
        /// <param name="pg">The page the keywords were found on.</param>
        public static void SaveTheKeywords(ContentSearchResult searchResults, IndexedPage pg)
        {
            var pageKeywords = new List<PageKeyWord>();

            try
            {
                // Map every ranking entry to a row for this page.
                foreach (KeywordRanking ranking in searchResults.KeyWordRankingList)
                {
                    pageKeywords.Add(new PageKeyWord
                    {
                        PageID       = pg.PageID,
                        Keyword      = ranking.Keyword,
                        KeywordCount = ranking.Rank
                    });
                }

                // All rows are written in one batch.
                DB.PageKeyWords.AddRange(pageKeywords);
                DB.SaveChanges();
            }
            catch (Exception ex)
            {
                string serializedKeywords = Services.SerializeIt.SerializeThis(searchResults.KeyWordRankingList);
                MessageLogger.LogThis(ex, serializedKeywords);
            }
        }