Ejemplo n.º 1
0
        /// <summary>
        /// Rebuilds the Lucene index of external web pages: deletes the existing index files,
        /// then for every child of the external-links folder item loads the linked page and
        /// stores its content, title, URL and largest image as one Lucene document.
        /// </summary>
        /// <returns>The <c>results</c> list (see the review note at the return statement).</returns>
        /// <exception cref="System.InvalidOperationException">
        /// The external-links folder item cannot be found in the context database.
        /// </exception>
        public List <Item> indexSitecoreContent()
        {
            // Resolve the source folder first so a missing item fails BEFORE the existing index is wiped.
            Item externalLinksFolder = Sitecore.Context.Database.GetItem("{6BD289C4-A964-4915-A769-A79C897D1746}");

            if (externalLinksFolder == null)
            {
                throw new System.InvalidOperationException(
                          "External links folder item {6BD289C4-A964-4915-A769-A79C897D1746} was not found in the context database.");
            }

            #region Indexing

            // Delete all existing index files from the index folder so the index is rebuilt from scratch.
            foreach (string filePath in System.IO.Directory.GetFiles(_luceneDir))
            {
                File.Delete(filePath);
            }

            // The analyzer turns field text into index terms (tokenizing, lower-casing, stop-word removal).
            Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);

            var htmlWeb = new HtmlWeb();

            // The using block guarantees the writer is disposed (closing it and releasing the index
            // lock) even when loading or parsing one of the external pages throws.
            using (IndexWriter writer = new IndexWriter(_directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                foreach (Item externalLinkItem in externalLinksFolder.Children)
                {
                    SitecoreFields.LinkField linkfield = externalLinkItem.Fields["ExternalURL"];
                    string externalurl = (linkfield != null) ? linkfield.Url : null;

                    // Nothing can be fetched or indexed for an item that has no external URL.
                    if (string.IsNullOrEmpty(externalurl))
                    {
                        continue;
                    }

                    HtmlAgilityPack.HtmlDocument htmlDocument = htmlWeb.Load(externalurl);
                    string       webPageTitle   = GetWebPageTitle(htmlDocument);
                    string       webPageContent = GetPageContent(htmlDocument);
                    LargestImage largestImage   = GetLargestWebPageSec(htmlDocument, externalurl);

                    // Fall back to the Sitecore image field when no image was found on the external page.
                    if (string.IsNullOrEmpty(largestImage.imgSrc))
                    {
                        Sitecore.Data.Fields.ImageField imageField = externalLinkItem.Fields["Image"];

                        if (imageField != null && imageField.MediaItem != null)
                        {
                            largestImage.imgSrc = Sitecore.Resources.Media.MediaManager.GetMediaUrl(imageField.MediaItem);
                            largestImage.imgAlt = imageField.Alt;
                        }
                    }

                    // Fall back to the Sitecore title when the web page has no title or h1 tags.
                    if (string.IsNullOrEmpty(webPageTitle))
                    {
                        var titleField = externalLinkItem.Fields["Title"];
                        webPageTitle = (titleField != null) ? titleField.Value : string.Empty;
                    }

                    // Fall back to the Sitecore page content when none could be extracted from the page.
                    if (string.IsNullOrEmpty(webPageContent))
                    {
                        var contentField = externalLinkItem.Fields["PageContent"];
                        webPageContent = (contentField != null) ? contentField.Value : string.Empty;
                    }

                    // A Lucene Field rejects null values, so every value is coalesced to an empty string.
                    Document doc = new Document();
                    doc.Add(new Field("External_WebPage_Content", webPageContent ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("External_WebPage_Title", webPageTitle ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("External_WebPage_Url", externalurl, Field.Store.YES, Field.Index.NOT_ANALYZED));
                    doc.Add(new Field("External_WebPage_ImageSrc", largestImage.imgSrc ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
                    doc.Add(new Field("External_WebPage_ImageAlt", largestImage.imgAlt ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));

                    writer.AddDocument(doc);
                }

                writer.Optimize();
                writer.Commit();
            }
            #endregion

            // NOTE(review): `results` is not declared anywhere in this method (the only local
            // declaration was in commented-out ContentSearch code). Presumably it is a class
            // member; confirm, otherwise this does not compile.
            return(results);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Downloads the images referenced by the page's <c>img</c> elements (GIFs excluded)
        /// and returns the one with the largest pixel area (width * height).
        /// </summary>
        /// <param name="doc">Parsed HTML document of the page.</param>
        /// <param name="webPageUrl">Absolute URL of the page; relative image sources are resolved against it.</param>
        /// <returns>
        /// A <c>LargestImage</c> whose <c>imgSrc</c>, <c>imgAlt</c> and <c>imgDimension</c> describe the
        /// largest image found, or a freshly constructed instance when the page has no usable image.
        /// </returns>
        /// <exception cref="UriFormatException"><paramref name="webPageUrl"/> is not a valid absolute URL.</exception>
        private static LargestImage GetLargestWebPageSec(HtmlDocument doc, string webPageUrl)
        {
            Uri          pageUri      = new Uri(webPageUrl);
            LargestImage largestImage = new LargestImage();

            var imageNodes = doc.DocumentNode.SelectNodes("//img");

            // SelectNodes yields null (not an empty collection) when nothing matches.
            if (imageNodes == null)
            {
                return(largestImage);
            }

            bool found = false;

            using (WebClient webClient = new WebClient())
            {
                foreach (HtmlNode node in imageNodes)
                {
                    var srcAttribute = node.Attributes["src"];

                    if (srcAttribute == null || string.IsNullOrEmpty(srcAttribute.Value))
                    {
                        continue;
                    }

                    // Resolve absolute, root-relative, page-relative and protocol-relative sources
                    // uniformly against the page URL instead of concatenating strings by hand.
                    Uri imageUri;

                    if (!Uri.TryCreate(pageUri, srcAttribute.Value, out imageUri))
                    {
                        continue;
                    }

                    // Only http(s) resources can be downloaded here; this skips e.g. inline "data:" images.
                    if (imageUri.Scheme != Uri.UriSchemeHttp && imageUri.Scheme != Uri.UriSchemeHttps)
                    {
                        continue;
                    }

                    // GIFs are excluded; test the path so a query string cannot hide the extension.
                    if (imageUri.AbsolutePath.EndsWith(".gif", StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    try
                    {
                        byte[] imageBytes = webClient.DownloadData(imageUri);

                        if (imageBytes.Length == 0)
                        {
                            continue;
                        }

                        // The stream must stay open for the lifetime of the image, so the image is disposed first.
                        using (MemoryStream memoryStream = new MemoryStream(imageBytes))
                        using (System.Drawing.Image image = System.Drawing.Image.FromStream(memoryStream))
                        {
                            if (image.Height <= 0 || image.Width <= 0)
                            {
                                continue;
                            }

                            int area = image.Width * image.Height;

                            if (!found || area > largestImage.imgDimension)
                            {
                                var altAttribute = node.Attributes["alt"];

                                largestImage.imgSrc       = imageUri.AbsoluteUri;
                                largestImage.imgDimension = area;
                                // Always overwrite the alt text so a previous candidate's alt is not
                                // carried over to an image that has none.
                                largestImage.imgAlt = (altAttribute != null) ? altAttribute.Value : string.Empty;
                                found = true;
                            }
                        }
                    }
                    catch (WebException)
                    {
                        // A broken or unreachable image on the external page must not abort the scan; skip it.
                    }
                    catch (ArgumentException)
                    {
                        // The downloaded bytes are not an image format System.Drawing can decode; skip it.
                    }
                }
            }

            return(largestImage);
        }