Ejemplo n.º 1
0
        /// <summary>
        /// Scores a Html-document as a lunch menu and, when the resulting probability stays
        /// below the configured limit, also searches the document for deep links.
        /// </summary>
        /// <param name="lunchMenuDocument">Document to be scored.</param>
        /// <returns>
        /// The score points for the document; <c>DeepLinks</c> is only assigned when the
        /// probability was below <c>Settings.Default.LunchMenuProbabilityLimit</c>.
        /// </returns>
        public static LunchMenuScores GetScoresForHtmlDocument(LunchRestaurantDocument lunchMenuDocument)
        {
            // resolve the detection implementation from the service locator
            var detector = ServiceLocator.Instance.Container.Resolve<ILunchMenuDetection>();

            // score the document first, then let the detection persist its lunch menu keyword counts
            var points = detector.GetScorePointsForDocument(lunchMenuDocument);
            detector.UpdateLunchMenuKeywordCountsDB();

            var result = new LunchMenuScores();
            result.Points = points;

            // a probability at or above the limit needs no deep link search
            var needsDeepLinkSearch = result.LunchMenuProbability < Settings.Default.LunchMenuProbabilityLimit;
            if (!needsDeepLinkSearch)
            {
                return result;
            }

            // otherwise look for potential deep links and persist the deep link keyword counts
            var candidates = new List<RestaurantDeepLink>();
            detector.FindDeepLinks(lunchMenuDocument, candidates);
            detector.UpdateDeepLinkKeywordCountsDB();
            result.DeepLinks = candidates;

            return result;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Attempts to fetch and load a HtmlDocument for a given URL.
        /// Also determines the MIME-type for the stream and computes a hash if needed.
        /// </summary>
        /// <param name="url">URL to be loaded.</param>
        /// <param name="timeout">Timeout for HttpWebRequest in seconds.</param>
        /// <returns>
        /// The fetched document, or <c>null</c> when the request fails for any reason or the
        /// response has no stream. When the detected MIME-type is neither text/html nor text/xml
        /// the content is not loaded into the HtmlDocument.
        /// </returns>
        public static LunchRestaurantDocument GetLunchRestaurantDocumentForUrl(string url, int timeout)
        {
            var document = new LunchRestaurantDocument { URL = url };
            var htmlDoc = new HtmlDocument();
            var allowedmimetypes = new[] { "text/html", "text/xml" };

            const int buffsize = 1024;

            try
            {
                var request = (HttpWebRequest)WebRequest.Create(GetUri(url));
                request.Timeout = timeout * 1000; // HttpWebRequest.Timeout is in milliseconds
                using (var response = (HttpWebResponse)request.GetResponse())
                {
                    // encoding announced by the response headers; UTF-8 when none can be resolved
                    var headerEncoding = TryGetEncoding(response.ContentEncoding) ??
                                         TryGetEncoding(response.CharacterSet) ??
                                         Encoding.UTF8;

                    using (var responseStream = response.GetResponseStream())
                    using (var ms = new MemoryStream())
                    {
                        if (responseStream == null)
                        {
                            return null;
                        }

                        // the first chunk is used for MIME detection before deciding to read the rest
                        var buf = new byte[buffsize];
                        var count = responseStream.Read(buf, 0, buffsize);

                        document.MimeType = MimeDetector.DetermineMIMEType(buf);

                        if (Array.Exists(allowedmimetypes, mimetype => mimetype.Equals(document.MimeType)))
                        {
                            do
                            {
                                ms.Write(buf, 0, count);
                            }
                            while ((count = responseStream.Read(buf, 0, buffsize)) != 0);

                            // ToArray() copies only the bytes actually written; GetBuffer() would also
                            // expose the unused tail of the internal buffer as trailing NUL characters
                            var bytes = ms.ToArray();

                            // prefer the encoding declared inside the document over the header encoding
                            var docEncoding = htmlDoc.DetectEncodingHtml(headerEncoding.GetString(bytes));
                            var convertedBytes = Encoding.Convert(docEncoding ?? headerEncoding, Encoding.Unicode, bytes);
                            var convertedData = Encoding.Unicode.GetString(convertedBytes);

                            htmlDoc.LoadHtml(convertedData);
                        }
                        else
                        {
                            _logger.Info("Discarded invalid mimetype '{0}' for URL: {1}", document.MimeType, url);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // fetching is best-effort: callers get null on failure, but the reason is no longer lost
                _logger.Info("Failed to load document for URL: {0} ({1})", url, ex.Message);
                return null;
            }

            if (htmlDoc.ParseErrors != null && htmlDoc.ParseErrors.Any())
            {
                // TODO: handle any parse errors
            }

            if (htmlDoc.DocumentNode != null)
            {
                document.HtmlDocument = htmlDoc;

                // let's also compute a hash for the document
                document.Hash = ComputeHashForDocument(htmlDoc, url);
            }

            return document;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Seeks deep-links recursively.
        /// </summary>
        /// <param name="pageDocument">HTML document to be checked.</param>
        /// <param name="validDeepLinks">Deep links that scored high enough. Matches are appended to this list.</param>
        /// <param name="checkedDeepLinks">Deep links that were already crawled. used to prevent loops.</param>
        /// <param name="level">
        /// Level of recursion. Nothing is searched once this reaches
        /// <c>Settings.Default.DeepLinkRecursionLevel</c>.
        /// </param>
        public void FindDeepLinks(LunchRestaurantDocument pageDocument,
                                  IList<RestaurantDeepLink> validDeepLinks,
                                  IList<string> checkedDeepLinks,
                                  int level)
        {
            var maxLevel = Settings.Default.DeepLinkRecursionLevel;
            if (level >= maxLevel)
            {
                return;
            }

            // let's first collect all valid links for the document
            // NOTE(review): assumes Utils.IsLink only accepts nodes that carry an href attribute - confirm
            var links = pageDocument.HtmlDocument
                                    .DocumentNode
                                    .DescendantNodes()
                                    .Where(Utils.IsLink)
                                    .Select(node => node.Attributes["href"].Value)
                                    .ToList();

            foreach (var link in links)
            {
                // first, let's construct a full deep link based on the base url
                // (ordinal comparison: URL schemes are not culture-sensitive text)
                var fullDeepLinkUrl = link.StartsWith("http", System.StringComparison.Ordinal)
                    ? link
                    : ConstructDeepLinkUrl(pageDocument.URL, link);

                // let's score the link itself
                var scoreForLink = ScoreDeepLink(fullDeepLinkUrl);

                // we'll only continue, if this link hasn't been checked yet and it has a high deeplink score
                if (!IsValidDeepLink(fullDeepLinkUrl, scoreForLink, checkedDeepLinks))
                {
                    continue;
                }

                checkedDeepLinks.Add(fullDeepLinkUrl);

                Logger.Info("analyzing a deep link: " + fullDeepLinkUrl);

                // let's score the document that the link points to
                var deepLinkDocument = Utils.GetLunchRestaurantDocumentForUrl(fullDeepLinkUrl, Settings.Default.HTTPTimeoutSeconds);
                if (deepLinkDocument == null)
                {
                    continue;
                }
                var pointsForDeepLinkDocument = GetScorePointsForDocument(deepLinkDocument);
                var deepLinkScores = new LunchMenuScores
                {
                    Points = pointsForDeepLinkDocument,
                    DeepLinkScorePoint = scoreForLink
                };

                // if this link gets a high score, we'll add it as a deep link
                // (deep links have a separate probability since they can be a lot different than full menus)
                if (deepLinkScores.DeepLinkProbability > Settings.Default.LunchMenuProbabilityLimit)
                {
                    validDeepLinks.Add(new RestaurantDeepLink
                    {
                        ContentType = (int)scoreForLink.DeepLinkContentType,
                        // NOTE(review): stores the href as found (possibly relative), not fullDeepLinkUrl - confirm this is intended
                        DeepLinkURL = link,
                    });
                }
                // ..otherwise we'll go deeper
                else
                {
                    // pass level + 1: a post-increment here would hand the unchanged level to the
                    // recursive call (so the depth limit never applies) and raise the level for the
                    // remaining links of this page instead
                    FindDeepLinks(deepLinkDocument, validDeepLinks, checkedDeepLinks, level + 1);
                }
            }
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Entry point for the recursive deep-link search: starts at recursion level 1
 /// with an empty list of already crawled links.
 /// </summary>
 /// <param name="pageDocument">HTML document to be checked.</param>
 /// <param name="validDeepLinks">Deep links that scored high enough.</param>
 public void FindDeepLinks(LunchRestaurantDocument pageDocument, IList<RestaurantDeepLink> validDeepLinks)
 {
     const int firstLevel = 1;
     var alreadyCrawled = new List<string>();

     FindDeepLinks(pageDocument, validDeepLinks, alreadyCrawled, firstLevel);
 }
Ejemplo n.º 5
0
 /// <summary>
 /// Scores every descendant node of a document that is not skipped and keeps
 /// the score points whose detection location is known.
 /// </summary>
 /// <param name="lunchMenuDocument">Document to be scored.</param>
 /// <returns>Score points with a detection location other than Unknown, in node order.</returns>
 public IList<LunchMenuScorePoint> GetScorePointsForDocument(LunchRestaurantDocument lunchMenuDocument)
 {
     var scorePoints = new List<LunchMenuScorePoint>();

     foreach (var node in lunchMenuDocument.HtmlDocument.DocumentNode.DescendantNodes())
     {
         if (Utils.ShouldSkipNode(node))
         {
             continue;
         }

         // only nodes with a known detection location contribute a score point
         var scored = ScoreNode(node);
         if (scored.DetectionLocation != LunchMenuDetectionLocation.Unknown)
         {
             scorePoints.Add(scored);
         }
     }

     return scorePoints;
 }
Ejemplo n.º 6
0
        /// <summary>
        /// Copies the document hash, the point total, the probability and the
        /// per-match-type keyword detection counts onto a potential lunch restaurant.
        /// </summary>
        /// <param name="lunchMenuDocument">Document whose hash is stored as the site hash.</param>
        /// <param name="potentialRestaurant">Restaurant instance to be filled in.</param>
        /// <param name="scores">Scores calculated for the document.</param>
        private static void CompletePotentialLunchRestaurant(LunchRestaurantDocument lunchMenuDocument,
                                                             LunchRestaurant potentialRestaurant,
                                                             LunchMenuScores scores)
        {
            // hash and overall result
            potentialRestaurant.SiteHash = lunchMenuDocument.Hash;
            potentialRestaurant.TotalPoints = scores.Points.Select(point => point.PointsGiven).Sum();
            potentialRestaurant.LunchMenuProbability = scores.LunchMenuProbability;

            // keyword detections: everything that matched at all, then one count per match type
            potentialRestaurant.TotalKeywordDetections =
                scores.Points.Where(point => point.DetectionType != StringMatchType.NoMatch).Count();
            potentialRestaurant.ExactKeywordDetections =
                scores.Points.Where(point => point.DetectionType == StringMatchType.Exact).Count();
            potentialRestaurant.PartialKeywordDetections =
                scores.Points.Where(point => point.DetectionType == StringMatchType.Partial).Count();
            potentialRestaurant.FuzzyKeywordDetections =
                scores.Points.Where(point => point.DetectionType == StringMatchType.Fuzzy).Count();
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Tells whether analysis can be skipped for a document: a lunch restaurant already
 /// exists for the document's URL and its site hash equals the document's hash.
 /// </summary>
 /// <param name="doc">Document whose URL and hash are compared.</param>
 /// <returns><c>true</c> when an existing restaurant with an equal hash is found; otherwise <c>false</c>.</returns>
 public static bool ShouldSkipAnalysis(LunchRestaurantDocument doc)
 {
     var stored = LunchDA.Instance.FindExistingLunchRestaurant(doc.URL);
     if (stored == null)
     {
         // nothing stored for this URL yet
         return false;
     }

     return stored.SiteHash.Equals(doc.Hash);
 }