Exemplo n.º 1
0
        /// <summary>
        /// Scores how relevant <paramref name="item"/> is for <paramref name="keyword"/>.
        /// </summary>
        /// <remarks>
        /// The score is the sum of the configured bonuses for: the title containing the
        /// keyword, the title starting with it, the content containing it, the first 100
        /// characters of the content containing it, and <c>KeywordRatio</c> being at least 1;
        /// plus whatever <c>GetRelevanceForNonExactMatch</c> returns. All keyword comparisons
        /// are case-insensitive and treat the keyword as literal text.
        /// Items whose <c>Site</c> is "Bonanza" are additionally checked for a minimum share
        /// of text in their content and get a score of -1 when they fail that check.
        /// </remarks>
        /// <param name="item">Item to score; its <c>Title</c>, <c>Content</c> and <c>Site</c> are read.</param>
        /// <param name="keyword">Search keyword, matched literally.</param>
        /// <returns>The accumulated score, or -1 for a rejected "Bonanza" item.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="keyword"/> is null.</exception>
        public int GetRelevance(Item item, string keyword)
        {
            // The keyword is plain text, not a pattern: escape it so that regex
            // metacharacters ("c++", "(new", "a.b") neither throw nor match loosely.
            var keywordPattern = Regex.Escape(keyword);

            var title = item.Title;
            var content = item.Content;
            var hasContent = !string.IsNullOrEmpty(content);

            var score = 0;

            if (title != null)
            {
                if (Regex.IsMatch(title, keywordPattern, RegexOptions.IgnoreCase))
                {
                    score += _programOptions.TitleContainsKeywordScore;
                }
                if (title.StartsWith(keyword, true, CultureInfo.InvariantCulture))
                {
                    score += _programOptions.TitleStartsWithKeywordScore;
                }
            }

            if (hasContent)
            {
                if (Regex.IsMatch(content, keywordPattern, RegexOptions.IgnoreCase))
                {
                    score += _programOptions.ContentContainsKeywordScore;
                }

                // Content of 100 characters or fewer is entirely "the first 100",
                // so clamp the length instead of skipping short content.
                var leadingText = content.Substring(0, Math.Min(content.Length, 100));
                if (Regex.IsMatch(leadingText, keywordPattern, RegexOptions.IgnoreCase))
                {
                    score += _programOptions.ContentFirst100ContainsKeywordScore;
                }
            }

            if (KeywordRatio(item, keyword) >= 1)
            {
                score += _programOptions.KeywordRatioScore;
            }

            score += GetRelevanceForNonExactMatch(item, keyword);

            if (item.Site == "Bonanza")
            {
                if (!hasContent)
                {
                    // No content means no text at all: reject the item rather than
                    // dereference null or divide by a zero length below.
                    score = -1;
                }
                else
                {
                    // Invariant lower-casing keeps the "<img " split independent of the
                    // current culture (e.g. Turkish maps 'I' to a dotless 'i').
                    // NOTE(review): this is the number of pieces produced by the split,
                    // which is not always equal to the number of <img> tags - kept as-is
                    // because the threshold below was presumably tuned against it.
                    var imageSegments = content.ToLowerInvariant()
                        .Split(new string[] { "<img " }, StringSplitOptions.RemoveEmptyEntries);

                    var converter = new ConverterFunctions();
                    var stripped = converter.StripTags(content,
                        new List<string>() { "font", "p", "span", "div", "h2", "h3", "h4", "tr", "td" });

                    // Percentage of the raw content length that remains after StripTags,
                    // reduced by 5 points per image segment.
                    int rate = (int)((stripped.Length / (double)content.Length) * 100);
                    rate -= imageSegments.Length * 5;
                    if (rate < 10)
                    {
                        score = -1;
                    }
                }
            }

            return score;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> and builds an item from its HTML.
        /// </summary>
        /// <remarks>
        /// The download is attempted up to three times, waiting three seconds between
        /// attempts, and goes through the configured proxy when <c>_options.UseProxy</c> is set.
        /// Meta description, tags, images, price and description are read with the XPath
        /// expressions declared on this class; each one is simply skipped when its node is
        /// not found.
        /// </remarks>
        /// <param name="title">Title stored on the item unchanged.</param>
        /// <param name="url">Page address; also matched against <c>IdRegex</c> to obtain the item id.</param>
        /// <param name="extraInfo">Not used by this implementation.</param>
        /// <returns>The populated item, or <c>null</c> when no HTML could be downloaded.</returns>
        public virtual Item.Item GetItem(string title, string url, string extraInfo)
        {
            const int maxTry = 3;
            string itemHtml = null;
            for (var attempt = 1; attempt <= maxTry; attempt++)
            {
                if (_options.UseProxy)
                {
                    itemHtml = WebHelper.CurlSimple(url, "text/html",
                        new WebProxy(_options.ProxyAddress + ":" + _options.ProxyPort));
                }
                else
                {
                    itemHtml = WebHelper.CurlSimple(url);
                }

                if (!string.IsNullOrEmpty(itemHtml))
                {
                    break;
                }

                // Only wait when another attempt follows; sleeping after the last
                // failure would just delay the caller.
                if (attempt < maxTry)
                {
                    Thread.Sleep(TimeSpan.FromSeconds(3));
                }
            }

            // An empty response counts as a failed download, same as in the retry loop.
            if (string.IsNullOrEmpty(itemHtml))
            {
                return null;
            }

            var item = new Item.Item()
                           {
                               Tags = new List<string>(),
                               ItemImages = new List<ItemImage>(),
                               Url = url
                           };

            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(itemHtml);

            var metaDescription = htmlDoc.DocumentNode.SelectSingleNode(MetaDescriptionXPath);
            if (metaDescription != null)
            {
                // The attribute indexer yields null for a missing attribute (the image
                // handling below relies on the same behavior), so check before reading.
                var descriptionAttribute = metaDescription.Attributes["content"];
                if (descriptionAttribute != null)
                {
                    item.MetaDescription = descriptionAttribute.Value;
                }
            }

            var tags = htmlDoc.DocumentNode.SelectNodes(TagsXPath);
            if (tags != null)
            {
                foreach (var tag in tags)
                {
                    item.Tags.Add(tag.InnerText);
                }
            }

            var images = htmlDoc.DocumentNode.SelectNodes(ImagesXPath);
            if (images != null)
            {
                foreach (var image in images)
                {
                    // Prefer the main image attribute and fall back to the alternative one.
                    var imageUrl = image.Attributes[ImagesAttribute] == null ? "" : image.Attributes[ImagesAttribute].Value;
                    if (string.IsNullOrEmpty(imageUrl))
                    {
                        imageUrl = image.Attributes[AlternativeImagesAttribute] == null ? "" : image.Attributes[AlternativeImagesAttribute].Value;
                        if (string.IsNullOrEmpty(imageUrl))
                        {
                            continue;
                        }
                    }

                    // URL schemes are case-insensitive and not linguistic text, so compare ordinally.
                    var hasScheme = imageUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                                    || imageUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
                    if (!hasScheme)
                    {
                        // Scheme-less sources ("//host/path" or "host/path") are completed with http.
                        var separator = imageUrl.StartsWith("//", StringComparison.Ordinal) ? "" : "//";
                        imageUrl = "http:" + separator + imageUrl;
                    }

                    item.ItemImages.Add(new ItemImage() { OriginalSource = imageUrl, Primary = true });
                }
            }

            var metaPrice = htmlDoc.DocumentNode.SelectSingleNode(PriceXPath);
            if (metaPrice != null)
            {
                item.Price = GetPriceValue(metaPrice);
            }

            var content = htmlDoc.DocumentNode.SelectSingleNode(DescriptionXPath);
            if (content != null)
            {
                // Remove anchor elements one at a time. Lazy quantifiers keep the text
                // between two links, and \b stops the pattern from matching other tags
                // whose name merely starts with "a" (abbr, article, ...).
                var withoutLinks = Regex.Replace(content.InnerHtml, @"<a\b[^>]*>.*?</a>", "",
                    RegexOptions.IgnoreCase | RegexOptions.Singleline);
                item.Content = RefineContent(withoutLinks);

                // The word count is taken from the original description markup,
                // not from the refined content.
                var converterFunctions = new ConverterFunctions();
                item.WordCount = converterFunctions.StripTags(content.InnerHtml, new List<string>()).WordCount();
            }

            item.Title = title;

            var match = new Regex(IdRegex).Match(url);
            if (match.Success)
            {
                item.Id = Int32.Parse(match.Groups[1].Value);
            }

            item.Site = Name;
            SetCreatedDate(htmlDoc, item);
            return item;
        }