Ejemplo n.º 1
0
        /// <summary>
        /// Entry point for the indexing process.
        /// Step 1: create an IndexedSiteID record that groups all pages indexed in this run.
        /// Step 2: seed the first page (content, links, keywords) so the workers have links
        /// to follow, then hand off to the recursive doPageIndexing loop.
        /// </summary>
        /// <param name="pageName">URL of the first page to index.</param>
        /// <returns>JSON totals from doPageIndexing, or null when setup fails.</returns>
        public JsonResult startPageIndexProcess(string pageName)
        {
            try
            {
                // Level limit comes from web.config and bounds the recursive walk.
                NUMBER_OF_LEVELS = Int16.Parse(ConfigurationManager.AppSettings["HowManyLevels"]);

                string parentFolder = SearchLibrary.GetDirectoryForFile(pageName, -1);
                string fileName     = System.IO.Path.GetFileName(pageName);

                // A new record acts as the group ID for every page indexed in this run.
                int siteIndexID = DBSearchResult.GetNewSiteIndex(parentFolder, fileName);

                // Seed the very first page so the parallel workers have links to use.
                ContentSearchResult firstPage = SearchLibrary.LoadPageContent(pageName, -1, siteIndexID);
                SearchLibrary.GetLinksAndKeywords(firstPage);
                firstPage.PageID = DBSearchResult.SaveSearchResults(firstPage);

                // Everything is in place; loop until all pages have been indexed.
                return doPageIndexing(-1, siteIndexID);
            }
            catch (Exception ex)
            {
                MessageLogger.LogThis(ex);
            }
            return null;
        }
        /// <summary>
        /// Extracts links, title, and plain-text content from a single loaded page,
        /// then counts keyword occurrences in the cleaned text. All results are
        /// written back onto the ContentSearchResult container, which is passed to
        /// the save method later. Pages already indexed are skipped.
        /// </summary>
        /// <param name="sr">Container holding the raw page content; populated in place.</param>
        public static void GetLinksAndKeywords(ContentSearchResult sr)
        {
            //check if this page has been indexed BEFORE getting the content.
            try
            {
                if (!DBSearchResult.IsPageContentIndexed(sr.PageURL, sr.PageName))
                {
                    sr.Title           = GetPageTitle(sr.SearchContent, sr.PageName);
                    sr.ParentDirectory = GetDirectoryForFile(sr.PageURL, sr.ParentID);
                    // (removed a no-op self-assignment of sr.PageURL here)
                    sr.TextContent     = GetTextFromHTML(sr.SearchContent);

                    //use the full page content to extract the links
                    sr.Links = GetLinks(sr.SearchContent);

                    //use ONLY the cleaned text to find the keyword ranking.
                    sr.KeyWordRankingList = GetKeywordCounts(sr.TextContent);
                }
            }
            catch (DbEntityValidationException ex)
            {
                // Attach the serialized container so validation failures can be reproduced.
                string data = Services.SerializeIt.SerializeThis(sr);
                MessageLogger.LogThis(ex, data);
            }
            catch (Exception ex)
            {
                MessageLogger.LogThis(ex);
            }
        }
        /// <summary>
        /// Builds a ContentSearchResult for a page and downloads its raw content,
        /// unless the page has already been indexed (in which case SearchContent
        /// is left unset). On failure the partially populated container is still
        /// returned so the caller can continue.
        /// </summary>
        /// <param name="pageURL">Full URL of the page to load.</param>
        /// <param name="parentID">ID of the page that linked to this one (-1 for the root page).</param>
        /// <param name="siteIndexID">Group ID for the site or page set being indexed.</param>
        /// <returns>The populated container; SearchContent may be unset on failure or when already indexed.</returns>
        public static ContentSearchResult LoadPageContent(string pageURL, int parentID, int siteIndexID)
        {
            ContentSearchResult searchResult = new ContentSearchResult();

            try
            {
                searchResult.ParentID      = parentID;
                searchResult.PageName      = GetFilenameFromURL(pageURL);
                searchResult.IndexedSiteID = siteIndexID;
                searchResult.PageURL       = pageURL;

                //check if this page has been indexed BEFORE getting the content.
                if (!DBSearchResult.IsPageContentIndexed(pageURL, searchResult.PageName))
                {
                    searchResult.SearchContent = GetPageContent(pageURL);
                }
            }
            // The former separate AggregateException catch was identical to this one,
            // so the two handlers were merged (AggregateException derives from Exception).
            catch (Exception ex)
            {
                MessageLogger.LogThis(ex);
            }
            return searchResult;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// This is the main workhorse which runs recursively.
        /// It will stop once GoneFarEnough returns true (limit reached).
        /// NUMBER_OF_LEVELS comes from the web.config "HowManyLevels" setting and
        /// controls the # of levels to walk/traverse.
        /// Each pass: load the saved links for the site, download their content in
        /// parallel, then parse/save sequentially and recurse for each saved page.
        /// </summary>
        /// <param name="parentID">The ID of the page which contains the link.
        /// NOTE(review): this parameter is not referenced in the body — verify whether it can be dropped.</param>
        /// <param name="siteIndexID">The ID assigned to the site or group of related pages being indexed.</param>
        /// <returns>JSON totals for the indexed site, produced even when an exception was logged.</returns>
        public JsonResult doPageIndexing(int parentID, int siteIndexID)
        {
            SearchTotal finalCount;

            try
            {
                //this method runs recursively until the limit is reached.
                // ConcurrentBag: multiple Parallel.ForEach threads add results concurrently.
                ConcurrentBag <ContentSearchResult> searchResults = new ConcurrentBag <ContentSearchResult>();
                // get the links from the saved links
                bool limitReached = DBSearchResult.GoneFarEnough(NUMBER_OF_LEVELS, siteIndexID);
                if (!limitReached)
                {
                    List <LinkedPageData> pageLinksMain = DBSearchResult.GetLinkDataForSiteIndexID(siteIndexID);

                    //put the links into a list so that they can be run in Parallel.
                    // Only the downloads run in parallel; DB writes stay sequential below.
                    Parallel.ForEach(pageLinksMain, (sr) =>
                    {
                        string fullURL          = string.Join("", sr.PageDirectory, sr.PageName);
                        ContentSearchResult csr = SearchLibrary.LoadPageContent(fullURL, sr.ParentID, siteIndexID);
                        searchResults.Add(csr);
                    });

                    // now that all the links have content, do a regular loop for the parsing and saving .
                    foreach (ContentSearchResult csr in searchResults)
                    {
                        SearchLibrary.GetLinksAndKeywords(csr);
                        csr.PageID = DBSearchResult.SaveSearchResults(csr);
                        // Recurse: the freshly saved page becomes the parent for its own links.
                        doPageIndexing(csr.PageID, siteIndexID);
                    }
                }
            }
            catch (DbEntityValidationException ex)
            {
                MessageLogger.LogThis(ex);
                Server.ClearError();
            }
            catch (Exception ex)
            {
                MessageLogger.LogThis(ex);
                Server.ClearError();
            }
            finally
            {
                // Totals are computed even after a logged failure so the caller always gets a response.
                finalCount = DBSearchResult.GetIndexedPageTotals(siteIndexID);
            }

            return(Json(finalCount, JsonRequestBehavior.AllowGet));
        }
        /// <summary>
        /// Gets the parent folder of a page. A relative link (no http prefix) carries
        /// no folder of its own, so its directory is looked up from the parent page
        /// already stored in the database.
        /// </summary>
        /// <param name="pageURL">Absolute or relative page URL.</param>
        /// <param name="parentID">ID of the page containing the link; used for relative URLs.</param>
        /// <returns>The directory/parent-folder portion of the URL.</returns>
        public static string GetDirectoryForFile(string pageURL, int parentID)
        {
            string fixedURL = RemoveEndingSlash(pageURL);

            // BUGFIX: was Contains("http"), which misclassified relative paths that
            // merely mention "http" (e.g. "/docs/http-notes.html") and missed "HTTP://".
            if (!fixedURL.StartsWith("http", StringComparison.OrdinalIgnoreCase))
            {
                //retrieve the path from the database.
                LinkedPageData pg = DBSearchResult.GetPageInfo(parentID);
                return pg.PageDirectory;
            }

            //the URL might contain only a # or only the domain name.
            //handle the case where it is a root folder: strip protocol + domain +
            //one path segment and see whether anything meaningful is left.
            string protocolPart = @"^(http(s)?(:\/\/))?(www\.)?";
            string domainPart   = @"[a-zA-Z0-9-_\.]+";
            string paramsPart   = @"/([-a-zA-Z0-9:%_\+.~#?&//=]*)/";
            Regex  domainMatch  = new Regex(string.Join("", protocolPart, domainPart, paramsPart));

            string leftOver = domainMatch.Replace(fixedURL, "");
            if (leftOver.Length < 2)
            {
                return fixedURL;
            }

            var    myRequest = new Uri(fixedURL);
            string lastPart  = myRequest.Segments.Last() + myRequest.Query;

            // BUGFIX: string.Replace removed EVERY occurrence of the last segment,
            // corrupting URLs such as ".../news/news" — trim only the final occurrence.
            int cut = fixedURL.LastIndexOf(lastPart, StringComparison.Ordinal);
            return (cut >= 0) ? fixedURL.Remove(cut, lastPart.Length) : fixedURL;
        }
Ejemplo n.º 6
0
        /// <summary>Retrieve the application log for debugging.</summary>
        /// <returns>JSON array of application-log entries.</returns>
        public JsonResult GetAppLog()
        {
            var logEntries = DBSearchResult.GetAppLog();
            return Json(logEntries, JsonRequestBehavior.AllowGet);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Search for the keyword in all pages.
        /// </summary>
        /// <param name="keyword">Term to look up.</param>
        /// <returns>JSON list of keyword rankings by page and # of occurences, with a link to each.</returns>
        public JsonResult RunSearch(string keyword)
        {
            var rankings = DBSearchResult.GetKeywordRanking(keyword);
            return Json(rankings, JsonRequestBehavior.AllowGet);
        }
Ejemplo n.º 8
0
 /// <summary>Clear the application log of all messages.</summary>
 /// <returns>JSON confirmation string.</returns>
 public JsonResult ClearAppLog()
 {
     DBSearchResult.ClearEventLog();
     return Json("App Log Cleared", JsonRequestBehavior.AllowGet);
 }