/// <summary>
/// Gets the page total and the keyword total for one indexed site.
/// </summary>
/// <param name="indexedSiteID">ID of the indexed site whose rows are counted.</param>
/// <returns>
/// A SearchTotal with PagesIndexed and KeywordsIndexed filled in,
/// or null when an exception was thrown (the exception is logged).
/// </returns>
public static SearchTotal GetIndexedPageTotals(int indexedSiteID)
{
    SearchTotal st = new SearchTotal();
    try
    {
        // Summing per-URL group counts is the same as counting the site's rows,
        // so count in the database instead of materializing every group.
        st.PagesIndexed = DB.IndexedPages.Count(px => px.IndexedSiteID == indexedSiteID);

        // One row per (page, keyword) pair of the site; grouping by page name
        // and summing the group sizes gave exactly this number.
        st.KeywordsIndexed = (from p in DB.IndexedPages
                              join pkw in DB.PageKeyWords on p.PageID equals pkw.PageID
                              where p.IndexedSiteID == indexedSiteID
                              select pkw).Count();

        return st;
    }
    catch (Exception ex)
    {
        MessageLogger.LogThis(ex);
        return null;
    }
}
/// <summary>
/// Groups the indexed pages by page URL and sums the keyword counts of the
/// matching keywords for each page.
/// </summary>
/// <param name="keyWord">
/// Text a stored keyword must contain. A null value disables the filter.
/// </param>
/// <returns>
/// One KeywordRanking per page URL (PageURL, Title, summed Rank),
/// or null when an exception was thrown (the exception is logged).
/// </returns>
public static List<KeywordRanking> GetKeywordRanking(string keyWord)
{
    try
    {
        // Contains() already covers every StartsWith() match, so a single
        // predicate is enough. The null check keeps "no keyword" meaning "all rows".
        var results = (from pg in DB.IndexedPages
                       join pgLinks in DB.PageKeyWords on pg.PageID equals pgLinks.PageID
                       where null == keyWord || pgLinks.Keyword.Contains(keyWord)
                       group new { pg, pgLinks } by pg.PageURL into pageGroup
                       select new KeywordRanking
                       {
                           // The group key is the page URL shared by every row in the group.
                           PageURL = pageGroup.Key,
                           Title = pageGroup.FirstOrDefault().pg.Title,
                           Rank = pageGroup.Sum(g => g.pgLinks.KeywordCount.Value)
                       }).ToList();

        return results;
    }
    catch (DbEntityValidationException ex)
    {
        MessageLogger.LogThis(ex);
        return null;
    }
    catch (Exception ex)
    {
        MessageLogger.LogThis(ex);
        return null;
    }
}
/// <summary>
/// Gets the stored page for a URL so it can be indexed.
/// A row matches when its PageURL equals <paramref name="pageURL"/>, or when its
/// PageName equals <paramref name="pageName"/> (compared in lower case) and its
/// PageURL starts with the scheme-and-authority part of <paramref name="pageURL"/>.
/// </summary>
/// <param name="pageURL">Absolute URL of the page.</param>
/// <param name="pageName">Page name to match.</param>
/// <returns>
/// The first matching IndexedPage, or null when there is no match or an
/// exception was thrown (the exception is logged).
/// </returns>
public static IndexedPage GetPageByName(string pageURL, string pageName)
{
    try
    {
        Uri siteURL = new Uri(pageURL);
        string domainName = siteURL.GetLeftPart(UriPartial.Authority);

        // FirstOrDefault() fetches a single row instead of loading every match
        // just to return the first one. The parentheses spell out the existing
        // && / || precedence.
        return (from p in DB.IndexedPages
                where (p.PageName.ToLower() == pageName.ToLower() && p.PageURL.StartsWith(domainName))
                      || p.PageURL == pageURL
                select p).FirstOrDefault();
    }
    catch (Exception ex)
    {
        MessageLogger.LogThis(ex);
        return null;
    }
}
/// <summary>
/// Main save method: stores the page itself (insert, or update when it is already
/// saved), then saves its links and keywords and updates its IsIndexed flag.
/// </summary>
/// <param name="searchResults">Crawl result for one page.</param>
/// <returns>
/// The PageID of the inserted or updated page, or 0 when the page could not be
/// saved (exceptions are logged).
/// </returns>
public static int SaveSearchResults(ContentSearchResult searchResults)
{
    const int MaxTitleLength = 50;

    try
    {
        int pageIDAfterInsert = 0;

        // Titles are capped at MaxTitleLength characters. The cap is applied once
        // so that the insert and the update path store the same value, and a
        // missing title is stored as an empty string instead of aborting the save.
        string title = searchResults.Title ?? string.Empty;
        if (title.Length > MaxTitleLength)
        {
            title = title.Substring(0, MaxTitleLength);
        }

        //save the page
        IndexedPage pg = new IndexedPage();
        pg.DateCreated = DateTime.Now;
        pg.ParentID = searchResults.ParentID;
        pg.PageName = GetFileWithFolder(searchResults.PageURL);
        pg.PageURL = searchResults.PageURL;
        pg.ParentDirectory = searchResults.ParentDirectory;
        pg.IndexedSiteID = searchResults.IndexedSiteID;
        pg.Title = title;

        if (!IsPageAlreadySaved(pg.PageURL, pg.PageName))
        {
            DB.IndexedPages.Add(pg);
            DB.SaveChanges();
            pageIDAfterInsert = pg.PageID;
        }
        else
        {
            //the page already exists so add a few missing fields.
            IndexedPage existing = GetPageByName(pg.PageURL, pg.PageName);
            if (existing == null)
            {
                // GetPageByName returns null when it finds no row or when it failed
                // (it logs its own exception); there is nothing to update.
                return 0;
            }

            pg = existing;
            pg.DateCreated = DateTime.Now;
            pg.Title = title;
            pageIDAfterInsert = pg.PageID;
            DB.SaveChanges();
        }

        SaveTheLinks(searchResults, pg);    //save the links for this page.
        SaveTheKeywords(searchResults, pg); //save the keywords
        UpdateIsIndexedFlag(pg.PageID);     //update the IsIndexed flag so it is not run again.

        return pageIDAfterInsert;
    }
    catch (DbEntityValidationException ex)
    {
        MessageLogger.LogThis(ex);
        return 0;
    }
    catch (Exception ex)
    {
        MessageLogger.LogThis(ex);
        return 0;
    }
}
/// <summary>
/// Saves the links pulled from the HTML of a page as new IndexedPage rows whose
/// parent is <paramref name="pg"/>. Links that are invalid, already saved, or
/// repeated within the same page are skipped.
/// </summary>
/// <param name="searchResults">
/// Crawl result holding the links. Its Links list is trimmed in place to the
/// first 10 entries.
/// </param>
/// <param name="pg">The saved parent page; supplies PageID and IndexedSiteID.</param>
public static void SaveTheLinks(ContentSearchResult searchResults, IndexedPage pg)
{
    List<IndexedPage> linkPages = new List<IndexedPage>();

    // URLs queued in this batch. The database check cannot see them until
    // SaveChanges runs, so repeated links on one page are tracked here.
    HashSet<string> queuedUrls = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    try
    {
        if (searchResults.Links.Count > 10)
        {
            //for speed, remove the links so we can see if the rest of the system works.
            searchResults.Links.RemoveRange(10, searchResults.Links.Count - 10);
        }

        foreach (string singleLink in searchResults.Links)
        {
            // Skip missing links and links of a single character (it might be only a /).
            if (singleLink == null || singleLink.Length <= 1)
            {
                continue;
            }

            IndexedPage cp = new IndexedPage();
            cp.DateCreated = DateTime.Now;
            cp.ParentID = pg.PageID;
            cp.PageName = GetFileWithFolder(singleLink);
            cp.IndexedSiteID = pg.IndexedSiteID;
            //get directory for the file, not only the filename.
            cp.ParentDirectory = Services.SearchLibrary.GetDirectoryForFile(singleLink, pg.PageID);
            cp.PageURL = GetFullURLFromPartial(cp.PageName, cp.ParentDirectory);
            cp.Title = ""; // THIS COMES ONLY FROM THE CONTENT;

            // code to avoid duplicates: not in the database and not already queued.
            if (IsValidLink(cp.PageURL)
                && !DBSearchResult.IsPageAlreadySaved(cp.PageURL, cp.PageName)
                && queuedUrls.Add(cp.PageURL))
            {
                linkPages.Add(cp);
            }
        }

        DB.IndexedPages.AddRange(linkPages);
        DB.SaveChanges();
    }
    catch (DbEntityValidationException ex)
    {
        string data = Services.SerializeIt.SerializeThis(searchResults);
        MessageLogger.LogThis(ex, data);
    }
    catch (Exception ex)
    {
        string data = Services.SerializeIt.SerializeThis(searchResults);
        MessageLogger.LogThis(ex, data);
    }
}
/// <summary>
/// Clears the AppLogs table. Exceptions are logged and not rethrown.
/// </summary>
public static void ClearEventLog()
{
    try
    {
        // Mark every row for deletion and save once, instead of calling
        // SaveChanges (one database round trip) for each row.
        var msgs = DB.AppLogs.ToList();
        DB.AppLogs.RemoveRange(msgs);
        DB.SaveChanges();
    }
    catch (Exception ex)
    {
        MessageLogger.LogThis(ex);
    }
}
/// <summary>
/// Checks whether a page has been saved already. A link might have been inserted
/// before; this avoids duplicates. A row matches when its PageURL equals
/// <paramref name="pageURL"/>, or when its PageName equals
/// <paramref name="pageName"/> (compared in lower case) and its PageURL starts
/// with the scheme-and-authority part of <paramref name="pageURL"/>.
/// </summary>
/// <param name="pageURL">Absolute URL of the page.</param>
/// <param name="pageName">Page name to match.</param>
/// <returns>
/// True when a matching row exists. Also true when an exception was thrown
/// (the exception is logged), so a failed check never reports the page as new.
/// </returns>
public static bool IsPageAlreadySaved(string pageURL, string pageName)
{
    try
    {
        Uri siteURL = new Uri(pageURL);
        string domainName = siteURL.GetLeftPart(UriPartial.Authority);

        // Any() asks the database whether a row exists instead of loading every
        // match. The parentheses spell out the existing && / || precedence.
        return (from p in DB.IndexedPages
                where (p.PageName.ToLower() == pageName.ToLower() && p.PageURL.StartsWith(domainName))
                      || p.PageURL == pageURL
                select p).Any();
    }
    catch (Exception ex)
    {
        MessageLogger.LogThis(ex);
        return true;
    }
}
/// <summary>
/// Decides whether a link should be saved. Some links point to the same page
/// (fragment links starting with #) or cannot be parsed as an absolute URL;
/// these are skipped.
/// </summary>
/// <param name="pageURL">The link to check; may be null or empty.</param>
/// <returns>
/// True when <paramref name="pageURL"/> parses as an absolute URI and does not
/// start with "#"; otherwise false.
/// </returns>
public static bool IsValidLink(string pageURL)
{
    // Reject missing links and same-page fragment links up front.
    if (string.IsNullOrEmpty(pageURL) || pageURL.StartsWith("#", StringComparison.Ordinal))
    {
        return false;
    }

    // Unparseable links are expected input here, so test them with TryCreate
    // rather than throwing and logging an exception for each one.
    Uri parsed;
    return Uri.TryCreate(pageURL, UriKind.Absolute, out parsed);
}
/// <summary>
/// Stores one PageKeyWord row per entry of the crawl result's keyword ranking
/// list: the keyword and the number of times it occurs, linked to the parent page.
/// Exceptions are logged together with the serialized ranking list and not rethrown.
/// </summary>
/// <param name="searchResults">Crawl result holding KeyWordRankingList.</param>
/// <param name="pg">The parent page; its PageID is written to every row.</param>
public static void SaveTheKeywords(ContentSearchResult searchResults, IndexedPage pg)
{
    try
    {
        // Project each ranking entry into the row that gets persisted.
        List<PageKeyWord> rows = searchResults.KeyWordRankingList
            .Select(ranking => new PageKeyWord
            {
                PageID = pg.PageID,
                Keyword = ranking.Keyword,
                KeywordCount = ranking.Rank
            })
            .ToList();

        DB.PageKeyWords.AddRange(rows);
        DB.SaveChanges();
    }
    catch (Exception failure)
    {
        string serializedRankings = Services.SerializeIt.SerializeThis(searchResults.KeyWordRankingList);
        MessageLogger.LogThis(failure, serializedRankings);
    }
}