/// <summary>
/// Inserts the given page into the index, or refreshes the stored row when a
/// page with the same URL already exists.
/// </summary>
/// <param name="newIndexedPage">
/// The page to persist. Its <c>Content</c> is truncated in place to at most
/// 200 characters before saving.
/// </param>
private static void SavePageIndex(IndexedPage newIndexedPage)
{
    // Only a short excerpt of the page content is stored.
    const int maxContentLength = 200;

    using (var context = new DatabaseContext())
    {
        // Guard against a null Content before reading its length.
        if (newIndexedPage.Content != null && newIndexedPage.Content.Length > maxContentLength)
        {
            newIndexedPage.Content = newIndexedPage.Content.Substring(0, maxContentLength);
        }

        // A single lookup serves both the existence check and the update,
        // instead of querying once with Any() and again with First().
        var indexedPage = context.IndexedPages.FirstOrDefault(p => p.Url == newIndexedPage.Url);

        if (indexedPage == null)
        {
            context.IndexedPages.Add(newIndexedPage);
        }
        else
        {
            // Existing row: only the view count, content and title are refreshed.
            indexedPage.ViewCount = newIndexedPage.ViewCount;
            indexedPage.Content = newIndexedPage.Content;
            indexedPage.Title = newIndexedPage.Title;
            context.Entry(indexedPage).State = EntityState.Modified;
        }

        context.SaveChanges();
    }
}
/// <summary>
/// Builds an index entry from a Nairaland page and passes it to
/// <c>SavePageIndex</c>. Pages whose URL contains "register" or
/// "programming-ads" are skipped.
/// </summary>
/// <param name="document">Parsed page markup to read title, views, content and author from.</param>
/// <param name="page">The page being indexed; supplies the URL.</param>
protected void IndexNairalandDocument(CQ document, PageToCrawl page)
{
    const string titleSuffix = " - Programming - Nairaland";

    var absoluteUri = page.PageUrl.AbsoluteUri;
    if (absoluteUri.Contains("register") || absoluteUri.Contains("programming-ads"))
    {
        return;
    }

    var entry = new IndexedPage();
    entry.Url = absoluteUri;

    var rawTitle = document.Select(".body>h2").Text();

    // The view count is taken from the text after the last '(' of the bold header.
    var viewSegment = document.Select(".body>.bold").Text().Split('(').LastOrDefault();
    if (viewSegment == null)
    {
        return;
    }

    var viewToken = viewSegment.Split(' ')[0];
    if (viewToken.IsInt())
    {
        entry.ViewCount = int.Parse(viewToken);
    }

    entry.Site = SiteNames.NAIRALAND;
    entry.Content = document.Select(".body table div.narrow").Html();
    entry.Author = document.Select(".body table a.user").First().Text();

    // Normalize the title by stripping the site suffix when present.
    entry.Title = rawTitle.EndsWith(titleSuffix)
        ? rawTitle.Substring(0, rawTitle.Length - titleSuffix.Length)
        : rawTitle;

    SavePageIndex(entry);
}
/// <summary>
/// Main method that calls several Save methods for the content, links and
/// keywords of a page: it inserts the page (or refreshes the existing row),
/// then saves its links and keywords and updates the IsIndexed flag.
/// </summary>
/// <param name="searchResults">The page data to persist.</param>
/// <returns>
/// The PageID of the inserted or updated page, or 0 when an exception was
/// caught and logged.
/// </returns>
public static int SaveSearchResults(ContentSearchResult searchResults)
{
    // Titles are capped at this many characters.
    const int maxTitleLength = 50;

    try
    {
        int pageIDAfterInsert = 0;

        //save the page
        IndexedPage pg = new IndexedPage();
        pg.DateCreated = DateTime.Now;
        pg.ParentID = searchResults.ParentID;
        pg.PageName = GetFileWithFolder(searchResults.PageURL);
        pg.PageURL = searchResults.PageURL;
        pg.ParentDirectory = searchResults.ParentDirectory;
        pg.IndexedSiteID = searchResults.IndexedSiteID;

        // Truncate once so the insert and update paths store the same title.
        // (Previously the insert path kept only 49 characters and the update
        // path stored the title untruncated.)
        string title = searchResults.Title.Length > maxTitleLength
            ? searchResults.Title.Substring(0, maxTitleLength)
            : searchResults.Title;
        pg.Title = title;

        if (!IsPageAlreadySaved(pg.PageURL, pg.PageName))
        {
            DB.IndexedPages.Add(pg);
            DB.SaveChanges();
            pageIDAfterInsert = pg.PageID;
        }
        else
        {
            //the page already exists so add a few missing fields.
            pg = GetPageByName(pg.PageURL, pg.PageName);
            pg.DateCreated = DateTime.Now;
            pg.Title = title;
            pageIDAfterInsert = pg.PageID;
            DB.SaveChanges();
        }

        SaveTheLinks(searchResults, pg);    //save the links for this page.
        SaveTheKeywords(searchResults, pg); //save the keywords
        UpdateIsIndexedFlag(pg.PageID);     //update the IsIndexed flag so it is not run again.

        return pageIDAfterInsert;
    }
    catch (DbEntityValidationException ex)
    {
        // Kept separate from the general handler so the logger receives the
        // statically typed validation exception.
        MessageLogger.LogThis(ex);
        return 0;
    }
    catch (Exception ex)
    {
        MessageLogger.LogThis(ex);
        return 0;
    }
}
/// <summary>
/// Saves the links which are pulled from the HTML of a page as child
/// <c>IndexedPage</c> rows of <paramref name="pg"/>.
/// </summary>
/// <remarks>
/// Side effect: when more than 10 links are present, <c>searchResults.Links</c>
/// is truncated in place to its first 10 entries. Exceptions are logged
/// together with the serialized <paramref name="searchResults"/> and not rethrown.
/// </remarks>
/// <param name="searchResults">The page data whose <c>Links</c> are saved.</param>
/// <param name="pg">The parent page; supplies PageID and IndexedSiteID for each link row.</param>
public static void SaveTheLinks(ContentSearchResult searchResults, IndexedPage pg)
{
    List<IndexedPage> linkPages = new List<IndexedPage>();

    // (PageURL, PageName) pairs already queued in this batch. The database
    // check alone cannot see rows that have not been saved yet, so a link
    // repeated on the same page would otherwise be inserted twice.
    HashSet<Tuple<string, string>> queued = new HashSet<Tuple<string, string>>();

    try
    {
        if (searchResults.Links.Count > 10)
        {
            //for speed, remove the links so we can see if the rest of the system works.
            searchResults.Links.RemoveRange(10, searchResults.Links.Count - 10);
        }

        foreach (string singleLink in searchResults.Links)
        {
            // Skip null entries and anything too short to be a page (it might be only a /).
            if (singleLink == null || singleLink.Length <= 1)
            {
                continue;
            }

            IndexedPage cp = new IndexedPage();
            cp.DateCreated = DateTime.Now;
            cp.ParentID = pg.PageID;
            cp.PageName = GetFileWithFolder(singleLink);
            cp.IndexedSiteID = pg.IndexedSiteID;
            //get directory for the file, not only the filename.
            cp.ParentDirectory = Services.SearchLibrary.GetDirectoryForFile(singleLink, pg.PageID);
            cp.PageURL = GetFullURLFromPartial(cp.PageName, cp.ParentDirectory);
            cp.Title = ""; // THIS COMES ONLY FROM THE CONTENT;

            // code to avoid duplicates: within this batch first, then against the database.
            if (IsValidLink(cp.PageURL)
                && queued.Add(Tuple.Create(cp.PageURL, cp.PageName))
                && !DBSearchResult.IsPageAlreadySaved(cp.PageURL, cp.PageName))
            {
                linkPages.Add(cp);
            }
        }

        DB.IndexedPages.AddRange(linkPages);
        DB.SaveChanges();
    }
    catch (DbEntityValidationException ex)
    {
        string data = Services.SerializeIt.SerializeThis(searchResults);
        MessageLogger.LogThis(ex, data);
    }
    catch (Exception ex)
    {
        string data = Services.SerializeIt.SerializeThis(searchResults);
        MessageLogger.LogThis(ex, data);
    }
}
/// <summary>
/// Builds an index entry from a Stack Overflow question page and passes it to
/// <c>SavePageIndex</c>. Any failure is recorded through
/// <c>System.Diagnostics.Trace</c> and not rethrown.
/// </summary>
/// <param name="document">Parsed page markup to read title, views, content and author from.</param>
/// <param name="page">The page being indexed; supplies the URL.</param>
protected void IndexStackOverflowDocument(CQ document, PageToCrawl page)
{
    // Captured outside the try block so the failure message can name the page.
    string pageUrl = null;

    try
    {
        pageUrl = page.PageUrl.AbsoluteUri;

        var newIndexedPage = new IndexedPage();
        newIndexedPage.Url = pageUrl;
        newIndexedPage.Title = document.Select("#question-header>h1").Text();

        // The second <b> under #qinfo is read as the view count; First() throws
        // when it is missing, which ends up in the catch block below.
        var viewText = document.Select("#qinfo b").Skip(1).First().InnerText;
        viewText = viewText.Split(' ')[0];
        if (viewText.IsInt())
        {
            newIndexedPage.ViewCount = int.Parse(viewText);
        }

        newIndexedPage.Site = SiteNames.STACKOVERFLOW;
        newIndexedPage.Content = document.Select(".postcell .post-text").Text();
        newIndexedPage.Author = document.Select(".post-signature .user-info .user-details a").First().Text();

        SavePageIndex(newIndexedPage);
    }
    catch (Exception exception)
    {
        // Indexing stays best-effort (nothing is rethrown), but the failure is
        // now recorded for health monitoring instead of being silently dropped.
        System.Diagnostics.Trace.TraceError(
            "Failed to index StackOverflow page {0}: {1}",
            pageUrl ?? "(unknown)",
            exception);
    }
}
/// <summary>
/// Saves each keyword found on a parent page together with the number of
/// times it occurs, one <c>PageKeyWord</c> row per keyword.
/// </summary>
/// <remarks>
/// Exceptions are logged together with the serialized keyword list and not rethrown.
/// </remarks>
/// <param name="searchResults">The page data whose <c>KeyWordRankingList</c> is saved.</param>
/// <param name="pg">The page the keywords belong to; supplies the PageID.</param>
public static void SaveTheKeywords(ContentSearchResult searchResults, IndexedPage pg)
{
    var rows = new List<PageKeyWord>();

    try
    {
        // Map every ranking entry onto a row tied to this page.
        foreach (KeywordRanking ranking in searchResults.KeyWordRankingList)
        {
            rows.Add(new PageKeyWord
            {
                PageID = pg.PageID,
                Keyword = ranking.Keyword,
                KeywordCount = ranking.Rank
            });
        }

        DB.PageKeyWords.AddRange(rows);
        DB.SaveChanges();
    }
    catch (Exception ex)
    {
        string serializedKeywords = Services.SerializeIt.SerializeThis(searchResults.KeyWordRankingList);
        MessageLogger.LogThis(ex, serializedKeywords);
    }
}