/// <summary>
/// Writes a short summary of the lunch menu scores of a URL to the console.
/// </summary>
/// <param name="url">URL the scores were calculated for.</param>
/// <param name="status">Status that is printed alongside the scores.</param>
/// <param name="scores">Scores whose total points, deep link count and probability are printed.</param>
public void PrintLunchMenuScores(string url, LunchRestaurantStatus status, LunchMenuScores scores)
{
    // gather the numbers first so that the formatting below stays readable
    var totalPoints = scores.Points.Sum(point => point.PointsGiven);
    var lunchMenuProbability = scores.LunchMenuProbability;

    // a missing deep link collection is reported as zero deep links
    var deepLinkCount = 0;
    if (scores.DeepLinks != null)
    {
        deepLinkCount = scores.DeepLinks.Count;
    }

    // header line, summary line and a separator; the individual score points are not printed
    var report = new StringBuilder()
        .AppendFormat("scores for URL: {0}\n", url)
        .AppendFormat("- status: {0} - total points: {1} - deep links: {2} - lunch menu probability: {3:P}\n",
                      status,
                      totalPoints,
                      deepLinkCount,
                      lunchMenuProbability)
        .AppendLine("\r\n-----------------------------------------------------------------------------");

    Console.WriteLine(report.ToString());
}
/// <summary>
/// Calculates lunch menu scores for a HTML document and, when the resulting lunch menu
/// probability is below the configured limit, searches the document for deep links.
/// </summary>
/// <param name="lunchMenuDocument">Document to be scored.</param>
/// <returns>
/// Scores carrying the calculated score points. <c>DeepLinks</c> is assigned only when the
/// probability was below <c>Settings.Default.LunchMenuProbabilityLimit</c>.
/// </returns>
public static LunchMenuScores GetScoresForHtmlDocument(LunchRestaurantDocument lunchMenuDocument)
{
    // the detection implementation is resolved from the service locator
    var menuDetection = ServiceLocator.Instance.Container.Resolve<ILunchMenuDetection>();

    // score the document, then update the lunch menu keyword counts
    var documentPoints = menuDetection.GetScorePointsForDocument(lunchMenuDocument);
    menuDetection.UpdateLunchMenuKeywordCountsDB();

    var result = new LunchMenuScores();
    result.Points = documentPoints;

    // a probability that is not below the limit needs no deep link search
    var isBelowLimit = result.LunchMenuProbability < Settings.Default.LunchMenuProbabilityLimit;
    if (!isBelowLimit)
    {
        return result;
    }

    // probability was below the limit - look for potential deep links in the document
    var foundDeepLinks = new List<RestaurantDeepLink>();
    menuDetection.FindDeepLinks(lunchMenuDocument, foundDeepLinks);
    menuDetection.UpdateDeepLinkKeywordCountsDB();
    result.DeepLinks = foundDeepLinks;

    return result;
}
/// <summary>
/// Seeks deep-links recursively. Every link of the document is scored; a link that passes
/// <c>IsValidDeepLink</c> is downloaded and its document is scored as well. Links whose
/// document scores above the probability limit are collected, all others are searched
/// one recursion level deeper.
/// </summary>
/// <param name="pageDocument">HTML document to be checked.</param>
/// <param name="validDeepLinks">Deep links that scored high enough. Matching links are added to this list.</param>
/// <param name="checkedDeepLinks">Deep links that were already crawled. Used to prevent loops; analyzed URLs are added to this list.</param>
/// <param name="level">Level of recursion. The search stops once it reaches <c>Settings.Default.DeepLinkRecursionLevel</c>.</param>
public void FindDeepLinks(LunchRestaurantDocument pageDocument, IList<RestaurantDeepLink> validDeepLinks, IList<string> checkedDeepLinks, int level)
{
    var maxLevel = Settings.Default.DeepLinkRecursionLevel;
    if (level >= maxLevel)
    {
        return;
    }

    // let's first collect all valid links for the document
    // NOTE(review): assumes Utils.IsLink only accepts nodes that carry an href attribute - confirm
    var links = pageDocument.HtmlDocument
                            .DocumentNode
                            .DescendantNodes()
                            .Where(Utils.IsLink)
                            .Select(node => node.Attributes["href"].Value)
                            .ToList();

    foreach (var link in links)
    {
        // first, let's construct a full deep link based on the base url
        var fullDeepLinkUrl = link.StartsWith("http")
            ? link
            : ConstructDeepLinkUrl(pageDocument.URL, link);

        // let's score the link itself
        var scoreForLink = ScoreDeepLink(fullDeepLinkUrl);

        // we'll only continue, if this link hasn't been checked yet and it has a high deeplink score
        if (!IsValidDeepLink(fullDeepLinkUrl, scoreForLink, checkedDeepLinks))
        {
            continue;
        }

        checkedDeepLinks.Add(fullDeepLinkUrl);

        Logger.Info("analyzing a deep link: " + fullDeepLinkUrl);

        // let's score the document that the link points to
        var deepLinkDocument = Utils.GetLunchRestaurantDocumentForUrl(fullDeepLinkUrl, Settings.Default.HTTPTimeoutSeconds);
        if (deepLinkDocument == null)
        {
            continue;
        }

        var pointsForDeepLinkDocument = GetScorePointsForDocument(deepLinkDocument);
        var deepLinkScores = new LunchMenuScores
        {
            Points = pointsForDeepLinkDocument,
            DeepLinkScorePoint = scoreForLink
        };

        // if this link gets a high score, we'll add it as a deep link
        // (deep links have a separate probability since they can be a lot different than full menus)
        if (deepLinkScores.DeepLinkProbability > Settings.Default.LunchMenuProbabilityLimit)
        {
            // NOTE(review): the link is stored as found in the document, not as fullDeepLinkUrl - confirm this is intended
            validDeepLinks.Add(new RestaurantDeepLink
            {
                ContentType = (int)scoreForLink.DeepLinkContentType,
                DeepLinkURL = link,
            });
        }
        // ..otherwise we'll go deeper
        else
        {
            // level + 1 (not level++): the callee must run one level deeper so the recursion
            // limit takes effect, while this frame's level stays the same for its remaining links
            FindDeepLinks(deepLinkDocument, validDeepLinks, checkedDeepLinks, level + 1);
        }
    }
}
/// <summary>
/// Completes a potential lunch restaurant instance by copying the document hash and by
/// storing the total points, the lunch menu probability and the keyword detection counts.
/// </summary>
/// <param name="lunchMenuDocument">Document whose <c>Hash</c> is stored as the restaurant's <c>SiteHash</c>.</param>
/// <param name="potentialRestaurant">Restaurant instance that receives the calculated values.</param>
/// <param name="scores">Scores that provide the points and the probability.</param>
private static void CompletePotentialLunchRestaurant(LunchRestaurantDocument lunchMenuDocument, LunchRestaurant potentialRestaurant, LunchMenuScores scores)
{
    potentialRestaurant.SiteHash = lunchMenuDocument.Hash;

    var points = scores.Points;

    // overall score
    potentialRestaurant.TotalPoints = points.Sum(point => point.PointsGiven);
    potentialRestaurant.LunchMenuProbability = scores.LunchMenuProbability;

    // every point that is anything but a non-match counts as a detection
    potentialRestaurant.TotalKeywordDetections =
        points.Count(point => point.DetectionType != StringMatchType.NoMatch);

    // detections broken down by match type
    potentialRestaurant.ExactKeywordDetections =
        points.Count(point => point.DetectionType == StringMatchType.Exact);
    potentialRestaurant.PartialKeywordDetections =
        points.Count(point => point.DetectionType == StringMatchType.Partial);
    potentialRestaurant.FuzzyKeywordDetections =
        points.Count(point => point.DetectionType == StringMatchType.Fuzzy);
}