public T Update(ContentCrawlProxy parameters)
        {
            // Build the index document for the crawled item; no IndexResults are
            // tracked here, so extraction warnings are only logged.
            var doc = CreateSearchDoc(parameters, null);

            try
            {
                SearchClient.Post(doc);
                SearchClient.Commit();
            }
            finally
            {
                // Close the client even when posting or committing throws, so a
                // failed update does not leave it open.
                SearchClient.Close();
            }

            return(doc);
        }
        /// <summary>
        /// Wraps the asset in a <c>ContentCrawlProxy</c>, builds its search document,
        /// posts the document to the search client, commits, and closes the client.
        /// </summary>
        /// <param name="asset">Asset to index; it becomes the proxy's ContentItem.</param>
        /// <returns>The search document that was built and posted.</returns>
        public T Update(ISearchableAsset asset)
        {
            var parameters = new ContentCrawlProxy()
            {
                ContentItem = asset
            };
            var doc = CreateSearchDoc(parameters, null);

            try
            {
                SearchClient.Post(doc);
                SearchClient.Commit();
            }
            finally
            {
                // Close the client even when posting or committing throws, so a
                // failed update does not leave it open.
                SearchClient.Close();
            }

            return(doc);
        }
        /// <summary>
        /// Builds the search document for a crawled content item and applies it to the index:
        /// the document is deleted by id when the content item is flagged NotSearchable,
        /// otherwise it is posted. The change is then committed and the client closed.
        /// </summary>
        /// <param name="contentItem">
        /// Crawl proxy describing the content. When null, nothing is sent to the search
        /// client and default(T) is returned.
        /// </param>
        /// <returns>The document that was built, or default(T) when no document could be built.</returns>
        public T Update(ContentCrawlProxy contentItem)
        {
            var doc = CreateSearchDoc(contentItem);

            // CreateSearchDoc yields default(T) for a null proxy; there is nothing to
            // post or delete in that case, so do not touch the search client at all.
            if (doc == null)
            {
                return(doc);
            }

            // CreateSearchDoc accepts a proxy without a ContentItem (the document is then
            // built from the crawl properties only); such a document is treated as searchable.
            var notSearchable = contentItem.ContentItem != null && contentItem.ContentItem.NotSearchable;

            try
            {
                if (notSearchable)
                {
                    SearchClient.DeleteById(doc.id);
                }
                else
                {
                    SearchClient.Post(doc);
                }

                SearchClient.Commit();
            }
            finally
            {
                // Close the client even when the delete/post/commit throws.
                SearchClient.Close();
            }

            return(doc);
        }
        /// <summary>
        /// Creates a new document of type T and fills it from a crawl proxy: the source id,
        /// the content item's id and base-type properties (when a content item is present),
        /// the crawl properties whose names match document properties, and finally the
        /// highlight summary assembled from summary, content and title.
        /// </summary>
        /// <param name="contentItem">Crawl proxy to convert; may be null.</param>
        /// <returns>The populated document, or default(T) when <paramref name="contentItem"/> is null.</returns>
        private T CreateSearchDoc(ContentCrawlProxy contentItem)
        {
            if (contentItem == null)
            {
                return(default(T));
            }

            var searchDoc = (T)Activator.CreateInstance(typeof(T), new object[] { });
            searchDoc.sourceid = _sourceId;

            var docProperties = searchDoc.GetType().GetProperties();

            // Copy id and inherited properties from the content item, if there is one.
            var item = contentItem.ContentItem;
            if (item != null)
            {
                searchDoc.id = item._ContentID;

                foreach (var ancestorType in ReflectionUtil.GetBaseTypes(item))
                {
                    GetBaseProperties(item, searchDoc, docProperties, ancestorType);
                }
            }

            // Apply every crawl property that has a same-named property on the document.
            var crawled = contentItem.Content;
            if (crawled != null)
            {
                foreach (var entry in crawled)
                {
                    var target = Array.Find(docProperties, p => p.Name == entry.Name);

                    if (target == null)
                    {
                        continue;
                    }

                    SetPropertyValue(searchDoc, target, entry.Value);
                }
            }

            // Highlight summary = stripped summary + stripped content + title, trimmed.
            string bodyText;
            if (searchDoc.content != null)
            {
                bodyText = HtmlParser.StripHTML(string.Join(" ", searchDoc.content));
            }
            else
            {
                bodyText = "";
            }

            var summaryText = HtmlParser.StripHTML(searchDoc.summary);

            searchDoc.highlightsummary = (summaryText + " " + bodyText + " " + searchDoc.title).Trim();

            return(searchDoc);
        }
        /// <summary>
        /// Creates a new document of type T for an asset: sets id and source id, applies the
        /// crawl properties that match document properties, copies base-type properties, and,
        /// unless the asset disables extraction, sends the asset blob to the search client's
        /// file extraction and appends the extracted body text to the document content.
        /// Finally assembles the highlight summary from summary, content and title.
        /// </summary>
        /// <param name="parameters">
        /// Crawl proxy; its ContentItem is dereferenced unconditionally and cast to ISearchableAsset.
        /// </param>
        /// <param name="results">
        /// Optional result tracker; its WarningCnt is incremented when extraction fails. May be null.
        /// </param>
        /// <returns>The populated document.</returns>
        private T CreateSearchDoc(ContentCrawlProxy parameters, IndexResults results)
        {
            var searchDoc = (T)Activator.CreateInstance(typeof(T), new object[] { });

            searchDoc.id       = parameters.ContentItem._ContentID;
            searchDoc.sourceid = _sourceId;

            var docProperties = searchDoc.GetType().GetProperties();

            // Apply every crawl property that has a same-named property on the document.
            var crawled = parameters.Content;
            if (crawled != null)
            {
                foreach (var entry in crawled)
                {
                    var target = Array.Find(docProperties, p => p.Name == entry.Name);

                    if (target == null)
                    {
                        continue;
                    }

                    SetPropertyValue(searchDoc, target, entry.Value);
                }
            }

            // Collect the base-type chain and visit it from the root type downwards
            // (a stack enumerates last-pushed first, i.e. the most distant ancestor first).
            // NOTE(review): the chain is taken from the proxy's own type (parameters),
            // not from parameters.ContentItem — confirm this is intentional.
            var ancestry = new Stack<Type>();
            for (var current = parameters.GetType().BaseType; current != null; current = current.BaseType)
            {
                ancestry.Push(current);
            }

            foreach (var ancestorType in ancestry)
            {
                GetBaseProperties(parameters.ContentItem, searchDoc, docProperties, ancestorType);
            }

            var asset = (ISearchableAsset)parameters.ContentItem;

            if (!asset.DisableExtract)
            {
                try
                {
                    var blob = asset.AssetBlob;

                    // Only blobs up to Threshold in length are sent for extraction.
                    if (blob != null && blob.Length <= Threshold)
                    {
                        var extractResponse = SearchClient.FileExtract(blob);

                        var responseParser = new XmlParser(extractResponse);
                        var bodyMarkup     = responseParser.ParseHTML("/response/str");
                        var bodyParser     = new HtmlParser(WebUtility.HtmlDecode(bodyMarkup));

                        // Content-Type and Creation-Date from the extract response are
                        // currently not mapped onto the document.

                        if (searchDoc.content == null)
                        {
                            searchDoc.content = new List<string>();
                        }

                        searchDoc.content.Add(WebUtility.HtmlEncode(bodyParser.ParseStripInnerHtml("//body")));
                    }
                }
                catch (Exception ex)
                {
                    // Extraction is best-effort: log, count the warning, and keep building the document.
                    LogWarning(string.Format("Extraction failed for ID: {0} NAME:{1}. {2}", parameters.ContentItem._ContentID, parameters.ContentItem.Name, ex.Message));

                    if (results != null)
                    {
                        results.WarningCnt++;
                    }
                }
            }

            // Highlight summary = stripped summary + stripped content + title, trimmed.
            string bodyText;
            if (searchDoc.content != null)
            {
                bodyText = HtmlParser.StripHTML(string.Join(" ", searchDoc.content));
            }
            else
            {
                bodyText = "";
            }

            var summaryText = HtmlParser.StripHTML(searchDoc.summary);

            searchDoc.highlightsummary = (summaryText + " " + bodyText + " " + searchDoc.title).Trim();

            return(searchDoc);
        }
        // Example no. 6 — listing separator left over from extraction (original marker lines: "Ejemplo n.º 6", "0").
        /// <summary>
        /// Serialises the crawl properties, preceded by "id" and "sourceid" entries, into a
        /// JSON object. Property names that occur more than once become an array of their
        /// non-null values; names that occur once are written as a single value, skipping
        /// null values and empty lists. The caller's Content list is not modified.
        /// </summary>
        /// <param name="id">Value written as the "id" property.</param>
        /// <param name="crawlProperties">
        /// Crawl proxy whose Content entries are serialised; a null Content is treated as empty.
        /// </param>
        /// <returns>The JSON text, or null when <paramref name="crawlProperties"/> is null.</returns>
        public string CreateSearchJsonDoc(string id, ContentCrawlProxy crawlProperties)
        {
            if (crawlProperties == null)
            {
                return(null);
            }

            // Work on a private copy: inserting into the caller's list would add the
            // id/sourceid entries again on every call with the same proxy.
            var entries = (crawlProperties.Content ?? Enumerable.Empty<CrawlerContent>()).ToList();

            entries.Insert(0, new CrawlerContent()
            {
                Value = id,
                Name  = "id",
            });

            entries.Insert(1, new CrawlerContent()
            {
                Value = _sourceId,
                Name  = "sourceid",
            });

            // Group once by name; names with the most entries come first (stable ordering,
            // so ties keep first-occurrence order).
            var groups = entries
                         .GroupBy(e => e.Name)
                         .Select(g => new { Name = g.Key, Items = g.ToList() })
                         .OrderByDescending(g => g.Items.Count);

            var dict = new Dictionary<string, object>();

            foreach (var group in groups)
            {
                if (group.Items.Count > 1)
                {
                    // Repeated name: emit all non-null values as a list.
                    var values = group.Items
                                 .Where(e => e.Value != null)
                                 .Select(e => e.Value)
                                 .ToList();

                    dict.Add(group.Name, values);
                    continue;
                }

                var single = group.Items[0].Value;

                if (single == null)
                {
                    continue;
                }

                // A single value that is itself a list is only written when it has items.
                var asList = single as IList;

                if (asList == null || asList.Count > 0)
                {
                    dict.Add(group.Name, single);
                }
            }

            // NOTE(review): 'Z' in the format string appears to be emitted as a literal while
            // dates are converted to local time — confirm the index expects local time
            // stamped with a UTC designator.
            var settings = new JsonSerializerSettings
            {
                DateFormatString     = "yyyy-MM-ddTHH:mm:ssZ",
                DateTimeZoneHandling = DateTimeZoneHandling.Local
            };

            return(JsonConvert.SerializeObject(dict, settings));
        }