/// <summary>
/// Calculates lunch menu scores for a Html-document.
/// </summary>
public static LunchMenuScores GetScoresForHtmlDocument(LunchRestaurantDocument lunchMenuDocument)
{
    // Resolve a fresh detection service based on the basic lunch menu keywords.
    var menuDetection = ServiceLocator.Instance.Container.Resolve<ILunchMenuDetection>();

    // Score the document, then persist the detection counts for the
    // found lunch menu keywords and deep link keywords.
    var points = menuDetection.GetScorePointsForDocument(lunchMenuDocument);
    menuDetection.UpdateLunchMenuKeywordCountsDB();

    // Wrap the points into a scores object.
    var scores = new LunchMenuScores
    {
        Points = points,
    };

    // When the probability falls below the limit, try to find
    // potential deep links in the document instead.
    if (scores.LunchMenuProbability < Settings.Default.LunchMenuProbabilityLimit)
    {
        var deepLinks = new List<RestaurantDeepLink>();
        menuDetection.FindDeepLinks(lunchMenuDocument, deepLinks);
        menuDetection.UpdateDeepLinkKeywordCountsDB();
        scores.DeepLinks = deepLinks;
    }

    return scores;
}
/// <summary>
/// Attempts to fetch and load a HtmlDocument for a given URL.
/// Also determines the MIME-type for the stream and computes a hash if needed.
/// </summary>
/// <param name="url">URL to be loaded.</param>
/// <param name="timeout">Timeout for HttpWebRequest in seconds.</param>
/// <returns>The loaded document, or null when the request fails or no response stream is available.</returns>
public static LunchRestaurantDocument GetLunchRestaurantDocumentForUrl(string url, int timeout)
{
    var document = new LunchRestaurantDocument { URL = url };
    var htmlDoc = new HtmlDocument();
    var allowedmimetypes = new[] { "text/html", "text/xml" };
    const int buffsize = 1024;

    try
    {
        var request = (HttpWebRequest)WebRequest.Create(GetUri(url));
        request.Timeout = timeout * 1000; // seconds -> milliseconds

        using (var response = (HttpWebResponse)request.GetResponse())
        {
            // Prefer the encoding advertised by the response headers; fall back to UTF-8.
            var headerEncoding = TryGetEncoding(response.ContentEncoding) ??
                                 TryGetEncoding(response.CharacterSet) ??
                                 Encoding.UTF8;

            var responseStream = response.GetResponseStream();
            if (responseStream == null)
            {
                return null;
            }

            var buf = new byte[buffsize];

            // Sniff the MIME type from the first chunk before reading the rest.
            var count = responseStream.Read(buf, 0, buffsize);
            document.MimeType = MimeDetector.DetermineMIMEType(buf);

            if (Array.Exists(allowedmimetypes, mimetype => mimetype.Equals(document.MimeType)))
            {
                using (var ms = new MemoryStream())
                {
                    do
                    {
                        ms.Write(buf, 0, count);
                    }
                    while ((count = responseStream.Read(buf, 0, buffsize)) != 0);

                    // BUGFIX: use ToArray() instead of GetBuffer(). GetBuffer() returns the
                    // whole internal buffer, which is usually larger than the written length,
                    // so the decoded document (and any hash computed from it) picked up
                    // trailing NUL bytes past the end of the actual response.
                    var bytes = ms.ToArray();

                    // Let the document's own meta tags override the header encoding.
                    var docEncoding = htmlDoc.DetectEncodingHtml(headerEncoding.GetString(bytes));
                    var convertedBytes = Encoding.Convert(docEncoding ?? headerEncoding, Encoding.Unicode, bytes);
                    var convertedData = Encoding.Unicode.GetString(convertedBytes);
                    htmlDoc.LoadHtml(convertedData);
                }
            }
            else
            {
                _logger.Info("Discarded invalid mimetype '{0}' for URL: {1}", document.MimeType, url);
            }
        }
    }
    catch
    {
        // Best-effort fetch: any network or decode failure yields a null document.
        return null;
    }

    if (htmlDoc.ParseErrors != null && htmlDoc.ParseErrors.Any())
    {
        // TODO: handle any parse errors
    }

    if (htmlDoc.DocumentNode != null)
    {
        document.HtmlDocument = htmlDoc;

        // let's also compute a hash for the document
        document.Hash = ComputeHashForDocument(htmlDoc, url);
    }

    return document;
}
/// <summary>
/// Seeks deep-links recursively.
/// </summary>
/// <param name="pageDocument">HTML document to be checked.</param>
/// <param name="validDeepLinks">Deep links that scored high enough.</param>
/// <param name="checkedDeepLinks">Deep links that were already crawled. used to prevent loops.</param>
/// <param name="level">Level of recursion.</param>
public void FindDeepLinks(LunchRestaurantDocument pageDocument, IList<RestaurantDeepLink> validDeepLinks, IList<string> checkedDeepLinks, int level)
{
    var maxLevel = Settings.Default.DeepLinkRecursionLevel;
    if (level >= maxLevel)
    {
        return;
    }

    // let's first collect all valid links for the document
    var links = pageDocument.HtmlDocument
                            .DocumentNode
                            .DescendantNodes()
                            .Where(Utils.IsLink)
                            .Select(node => node.Attributes["href"].Value)
                            .ToList();

    foreach (var link in links)
    {
        // first, let's construct a full deep link based on the base url
        var fullDeepLinkUrl = link.StartsWith("http") ? link : ConstructDeepLinkUrl(pageDocument.URL, link);

        // let's score the link itself
        var scoreForLink = ScoreDeepLink(fullDeepLinkUrl);

        // we'll only continue, if this link hasn't been checked yet and it has a high deeplink score
        if (!IsValidDeepLink(fullDeepLinkUrl, scoreForLink, checkedDeepLinks))
        {
            continue;
        }

        checkedDeepLinks.Add(fullDeepLinkUrl);
        Logger.Info("analyzing a deep link: " + fullDeepLinkUrl);

        // let's score the document that the link points to
        var deepLinkDocument = Utils.GetLunchRestaurantDocumentForUrl(fullDeepLinkUrl, Settings.Default.HTTPTimeoutSeconds);
        if (deepLinkDocument == null)
        {
            continue;
        }

        var pointsForDeepLinkDocument = GetScorePointsForDocument(deepLinkDocument);
        var deepLinkScores = new LunchMenuScores
        {
            Points = pointsForDeepLinkDocument,
            DeepLinkScorePoint = scoreForLink
        };

        // if this link gets a high score, we'll add it as a deep link
        // (deep links have a separate probability since they can be a lot different than full menus)
        if (deepLinkScores.DeepLinkProbability > Settings.Default.LunchMenuProbabilityLimit)
        {
            validDeepLinks.Add(new RestaurantDeepLink
            {
                ContentType = (int)scoreForLink.DeepLinkContentType,
                // NOTE(review): stores the raw href, not fullDeepLinkUrl - presumably
                // intentional, but verify consumers can resolve relative URLs.
                DeepLinkURL = link,
            });
        }
        // ..otherwise we'll go deeper
        else
        {
            // BUGFIX: was "level++" - post-increment passed the *current* level to the
            // recursive call, so depth never increased and the maxLevel guard above
            // could never stop the recursion (it was bounded only by checkedDeepLinks).
            FindDeepLinks(deepLinkDocument, validDeepLinks, checkedDeepLinks, level + 1);
        }
    }
}
/// <summary>
/// Seeks deep-links recursively, starting at level 1 with an empty visited-link list.
/// </summary>
/// <param name="pageDocument">HTML document to be checked.</param>
/// <param name="validDeepLinks">Deep links that scored high enough.</param>
public void FindDeepLinks(LunchRestaurantDocument pageDocument, IList<RestaurantDeepLink> validDeepLinks) =>
    FindDeepLinks(pageDocument, validDeepLinks, new List<string>(), 1);
/// <summary>
/// Calculates scorepoints for a given document.
/// </summary>
/// <param name="lunchMenuDocument">Document to be scored.</param>
public IList<LunchMenuScorePoint> GetScorePointsForDocument(LunchRestaurantDocument lunchMenuDocument)
{
    // Candidate nodes: every descendant that isn't explicitly skipped.
    var candidateNodes = lunchMenuDocument.HtmlDocument
                                          .DocumentNode
                                          .DescendantNodes()
                                          .Where(n => !Utils.ShouldSkipNode(n));

    // Score each candidate and keep only points with a known detection location.
    return candidateNodes
        .Select(ScoreNode)
        .Where(point => point.DetectionLocation != LunchMenuDetectionLocation.Unknown)
        .ToList();
}
/// <summary>
/// Completes a potential lunch restaurant instance by calculating points and probability for it.
/// </summary>
private static void CompletePotentialLunchRestaurant(LunchRestaurantDocument lunchMenuDocument, LunchRestaurant potentialRestaurant, LunchMenuScores scores)
{
    var points = scores.Points;

    potentialRestaurant.SiteHash = lunchMenuDocument.Hash;
    potentialRestaurant.TotalPoints = points.Sum(point => point.PointsGiven);
    potentialRestaurant.LunchMenuProbability = scores.LunchMenuProbability;

    // Detection counters, broken down by how the keyword was matched.
    potentialRestaurant.TotalKeywordDetections = points.Count(point => point.DetectionType != StringMatchType.NoMatch);
    potentialRestaurant.ExactKeywordDetections = points.Count(point => point.DetectionType == StringMatchType.Exact);
    potentialRestaurant.PartialKeywordDetections = points.Count(point => point.DetectionType == StringMatchType.Partial);
    potentialRestaurant.FuzzyKeywordDetections = points.Count(point => point.DetectionType == StringMatchType.Fuzzy);
}
/// <summary>
/// Determines whether analysis can be skipped because a restaurant for this URL
/// already exists and its stored site hash matches the document's current hash
/// (i.e. the site content has not changed since it was last analyzed).
/// </summary>
/// <param name="doc">Freshly fetched document carrying the URL and content hash.</param>
/// <returns>True when an existing restaurant with an identical hash is found.</returns>
public static bool ShouldSkipAnalysis(LunchRestaurantDocument doc)
{
    var existingDoc = LunchDA.Instance.FindExistingLunchRestaurant(doc.URL);

    // BUGFIX: null-safe ordinal comparison. The original called
    // existingDoc.SiteHash.Equals(doc.Hash), which threw a
    // NullReferenceException whenever a stored restaurant had no hash yet.
    return existingDoc != null &&
           string.Equals(existingDoc.SiteHash, doc.Hash, StringComparison.Ordinal);
}