Esempio n. 1
0
        /// <summary>
        /// Writes a short score summary for a URL to the console.
        /// </summary>
        /// <param name="url">URL the scores belong to.</param>
        /// <param name="status">Restaurant status shown on the summary line.</param>
        /// <param name="scores">Scores whose total points, deep link count and lunch menu probability are printed.</param>
        public void PrintLunchMenuScores(string url, LunchRestaurantStatus status, LunchMenuScores scores)
        {
            //Console.OutputEncoding = Encoding.Default;

            var report = new StringBuilder();
            report.AppendFormat("scores for URL: {0}\n", url);

            // gather the values of the summary line
            var totalPoints = scores.Points.Sum(p => p.PointsGiven);

            // a missing deep link collection is reported as zero deep links
            var deepLinkCount = 0;
            if (scores.DeepLinks != null)
            {
                deepLinkCount = scores.DeepLinks.Count;
            }

            var probability = scores.LunchMenuProbability;

            report.AppendFormat("- status: {0} - total points: {1} - deep links: {2} - lunch menu probability: {3:P}\n",
                                status,
                                totalPoints,
                                deepLinkCount,
                                probability);

            // per-keyword detail output is currently disabled:
            //foreach (var scorePoint in scores.Points.OrderByDescending(p => p.PointsGiven))
            //{
            //    var consoledata = Utils.CleanContentForConsole(scorePoint.DetectedText);
            //    report.AppendFormat("{0,2:00}: {1}\t -> {2}\n",
            //                        scorePoint.PointsGiven,
            //                        scorePoint.DetectedKeyword,
            //                        consoledata);
            //}

            // separator line that closes the report
            report.AppendLine("\r\n-----------------------------------------------------------------------------");

            Console.WriteLine(report.ToString());
        }
        /// <summary>
        /// Calculates lunch menu scores for a HTML document and, when the resulting probability
        /// is below the configured limit, also looks for deep links in that document.
        /// </summary>
        /// <param name="lunchMenuDocument">Document to be scored.</param>
        /// <returns>
        /// Scores wrapping the calculated points; deep links are set only when the probability
        /// was below <c>Settings.Default.LunchMenuProbabilityLimit</c>.
        /// </returns>
        public static LunchMenuScores GetScoresForHtmlDocument(LunchRestaurantDocument lunchMenuDocument)
        {
            // a fresh detection instance is resolved for every document
            var detector = ServiceLocator.Instance.Container.Resolve<ILunchMenuDetection>();

            // calculate the points, then update the detection counts of the lunch menu keywords
            var documentPoints = detector.GetScorePointsForDocument(lunchMenuDocument);
            detector.UpdateLunchMenuKeywordCountsDB();

            var result = new LunchMenuScores();
            result.Points = documentPoints;

            // probability reached the limit - no need to look for deep links
            if (!(result.LunchMenuProbability < Settings.Default.LunchMenuProbabilityLimit))
            {
                return result;
            }

            // otherwise try to find potential deep links in the document
            var foundLinks = new List<RestaurantDeepLink>();
            detector.FindDeepLinks(lunchMenuDocument, foundLinks);
            detector.UpdateDeepLinkKeywordCountsDB();
            result.DeepLinks = foundLinks;

            return result;
        }
Esempio n. 3
0
        /// <summary>
        /// Seeks deep-links recursively.
        /// </summary>
        /// <remarks>
        /// Every link of the document is resolved to a full URL and scored. Links accepted by
        /// <c>IsValidDeepLink</c> are fetched and their documents scored; a document whose deep
        /// link probability exceeds <c>Settings.Default.LunchMenuProbabilityLimit</c> is added to
        /// <paramref name="validDeepLinks"/>, otherwise the search continues inside that document
        /// one level deeper. Recursion stops once <paramref name="level"/> reaches
        /// <c>Settings.Default.DeepLinkRecursionLevel</c>.
        /// </remarks>
        /// <param name="pageDocument">HTML document to be checked.</param>
        /// <param name="validDeepLinks">Deep links that scored high enough. Found links are appended to this list.</param>
        /// <param name="checkedDeepLinks">Deep links that were already crawled. Used to prevent loops; every crawled URL is appended.</param>
        /// <param name="level">Level of recursion of this call.</param>
        public void FindDeepLinks(LunchRestaurantDocument pageDocument,
                                  IList<RestaurantDeepLink> validDeepLinks,
                                  IList<string> checkedDeepLinks,
                                  int level)
        {
            var maxLevel = Settings.Default.DeepLinkRecursionLevel;
            if (level >= maxLevel)
            {
                return;
            }

            // let's first collect all valid links for the document
            var links = pageDocument.HtmlDocument
                                    .DocumentNode
                                    .DescendantNodes()
                                    .Where(Utils.IsLink)
                                    .Select(node => node.Attributes["href"].Value)
                                    .ToList();

            foreach (var link in links)
            {
                // first, let's construct a full deep link based on the base url
                var fullDeepLinkUrl = link.StartsWith("http") ? link : ConstructDeepLinkUrl(pageDocument.URL, link);

                // let's score the link itself
                var scoreForLink = ScoreDeepLink(fullDeepLinkUrl);

                // we'll only continue, if this link hasn't been checked yet and it has a high deeplink score
                if (!IsValidDeepLink(fullDeepLinkUrl, scoreForLink, checkedDeepLinks))
                {
                    continue;
                }

                // remember the link before fetching it, so it is not crawled again even if the fetch fails
                checkedDeepLinks.Add(fullDeepLinkUrl);

                Logger.Info("analyzing a deep link: " + fullDeepLinkUrl);

                // let's score the document that the link points to
                var deepLinkDocument = Utils.GetLunchRestaurantDocumentForUrl(fullDeepLinkUrl, Settings.Default.HTTPTimeoutSeconds);
                if (deepLinkDocument == null)
                {
                    continue;
                }
                var pointsForDeepLinkDocument = GetScorePointsForDocument(deepLinkDocument);
                var deepLinkScores = new LunchMenuScores
                {
                    Points = pointsForDeepLinkDocument,
                    DeepLinkScorePoint = scoreForLink
                };

                // if this link gets a high score, we'll add it as a deep link
                // (deep links have a separate probability since they can be a lot different than full menus)
                if (deepLinkScores.DeepLinkProbability > Settings.Default.LunchMenuProbabilityLimit)
                {
                    // NOTE(review): the raw href is stored here, not the resolved fullDeepLinkUrl - confirm this is intended
                    validDeepLinks.Add(new RestaurantDeepLink
                    {
                        ContentType = (int)scoreForLink.DeepLinkContentType,
                        DeepLinkURL = link,
                    });
                }
                // ..otherwise we'll go deeper
                else
                {
                    // pass level + 1: a post-increment would hand the unchanged level to the nested call
                    // (so the recursion limit would never apply to it) and would instead raise the level
                    // for the remaining links of this page
                    FindDeepLinks(deepLinkDocument, validDeepLinks, checkedDeepLinks, level + 1);
                }
            }
        }
        /// <summary>
        /// Fills in the site hash, total points, lunch menu probability and keyword detection
        /// counts of a potential lunch restaurant.
        /// </summary>
        /// <param name="lunchMenuDocument">Document whose hash is stored as the site hash.</param>
        /// <param name="potentialRestaurant">Restaurant instance to be completed.</param>
        /// <param name="scores">Scores the points, probability and detection counts are taken from.</param>
        private static void CompletePotentialLunchRestaurant(LunchRestaurantDocument lunchMenuDocument,
                                                             LunchRestaurant potentialRestaurant,
                                                             LunchMenuScores scores)
        {
            potentialRestaurant.SiteHash = lunchMenuDocument.Hash;

            var points = scores.Points;

            potentialRestaurant.TotalPoints = points.Sum(point => point.PointsGiven);
            potentialRestaurant.LunchMenuProbability = scores.LunchMenuProbability;

            // the total covers every point with a match; the rest are counted per match type
            potentialRestaurant.TotalKeywordDetections = points
                .Where(point => point.DetectionType != StringMatchType.NoMatch)
                .Count();
            potentialRestaurant.ExactKeywordDetections = points
                .Where(point => point.DetectionType == StringMatchType.Exact)
                .Count();
            potentialRestaurant.PartialKeywordDetections = points
                .Where(point => point.DetectionType == StringMatchType.Partial)
                .Count();
            potentialRestaurant.FuzzyKeywordDetections = points
                .Where(point => point.DetectionType == StringMatchType.Fuzzy)
                .Count();
        }