/// <summary>
        /// Runs the parsing strategies for a restaurant on a background task and waits a limited
        /// time for the analysis to complete.
        /// Strategies are tried in descending priority order; the first result whose confidence
        /// exceeds 0.90 stops the search, and the last result obtained is passed on to
        /// <c>CompleteLunchRestaurantAnalysis</c>.
        /// If the wait times out, an error is logged and this method returns. The background task
        /// itself is not aborted, so it may still finish (and complete the analysis) later.
        /// </summary>
        /// <param name="restaurant">The restaurant whose lunch menu should be analyzed.</param>
        /// <exception cref="AggregateException">
        /// Thrown by the wait when the analysis task fails before the timeout elapses.
        /// </exception>
        private void AnalyzeLunchRestaurantWithTimeout(LunchRestaurant restaurant)
        {
            // let's analyze based on parsing strategies' priority
            var strategies = _parsingStrategies.OrderByDescending(st => st.Priority);

            // we'll stop waiting for the process after timeout in case parsing goes wrong
            // TODO: timeout settings in DB
            const int timeoutForProcessing = 240000;

            var analysis = Task.Factory.StartNew(() =>
            {
                LunchMenu parseResult = null;

                foreach (var strategy in strategies)
                {
                    parseResult = strategy.ParseLunchMenu(restaurant);

                    // TODO: confidence settings in DB
                    if (parseResult.Confidence > 0.90M)
                    {
                        break;
                    }
                }

                CompleteLunchRestaurantAnalysis(parseResult);
            });

            // Wait(int) returns false when the timeout elapses first. This replaces a dedicated
            // timer thread that used to sleep for the whole timeout on every call, even when the
            // analysis had finished long before.
            if (!analysis.Wait(timeoutForProcessing))
            {
                Logger.ErrorFormat("Analysis timed out for URL: {0}", restaurant.AbsoluteURL);
            }

            Console.WriteLine("---------------------------------------------------------------------------------\n");
        }
Example #2
0
        /// <summary>
        /// Stores a lunch restaurant: when a restaurant with the same URL is found, its status,
        /// hash and scoring fields are overwritten with the given values and its update date is
        /// refreshed; otherwise the given instance is added as a new record.
        /// An <c>UpdateException</c> whose base error message contains "not unique" is ignored;
        /// any other update error is rethrown.
        /// </summary>
        /// <param name="lunchRestaurant">The restaurant carrying the values to store.</param>
        public void UpdateLunchRestaurant(LunchRestaurant lunchRestaurant)
        {
            try
            {
                using (var context = new LunchEntities())
                {
                    // NOTE(review): the lookup does not go through the local context - confirm
                    // that the returned entity is tracked so that SaveChanges persists the edits.
                    var stored = FindExistingLunchRestaurant(lunchRestaurant.URL);

                    if (stored == null)
                    {
                        // unknown URL -> insert as a new row
                        lunchRestaurant.DateAdded = DateTime.UtcNow;
                        context.LunchRestaurants.AddObject(lunchRestaurant);
                    }
                    else
                    {
                        // known URL -> copy over the freshly calculated values
                        stored.Status = lunchRestaurant.Status;
                        stored.SiteHash = lunchRestaurant.SiteHash;
                        stored.LunchMenuProbability = lunchRestaurant.LunchMenuProbability;
                        stored.TotalPoints = lunchRestaurant.TotalPoints;
                        stored.TotalKeywordDetections = lunchRestaurant.TotalKeywordDetections;
                        stored.ExactKeywordDetections = lunchRestaurant.ExactKeywordDetections;
                        stored.PartialKeywordDetections = lunchRestaurant.PartialKeywordDetections;
                        stored.FuzzyKeywordDetections = lunchRestaurant.FuzzyKeywordDetections;
                        stored.DateUpdated = DateTime.UtcNow;
                    }

                    context.SaveChanges();
                }
            }
            catch (UpdateException failure)
            {
                // a uniqueness violation is tolerated, everything else is a real error
                var isDuplicate = failure.GetBaseException().Message.Contains("not unique");
                if (!isDuplicate)
                {
                    throw;
                }
            }
        }
        /// <summary>
        /// Fills in the score-derived fields of a potential lunch restaurant: the site hash, the
        /// sum of given points, the lunch menu probability and the number of keyword detections
        /// in total and per match type (exact, partial, fuzzy).
        /// </summary>
        /// <param name="lunchMenuDocument">Document whose hash is stored as the site hash.</param>
        /// <param name="potentialRestaurant">The restaurant instance to fill in.</param>
        /// <param name="scores">Scores providing the points and the probability.</param>
        private static void CompletePotentialLunchRestaurant(LunchRestaurantDocument lunchMenuDocument,
                                                             LunchRestaurant potentialRestaurant,
                                                             LunchMenuScores scores)
        {
            potentialRestaurant.SiteHash = lunchMenuDocument.Hash;
            potentialRestaurant.TotalPoints = scores.Points.Sum(p => p.PointsGiven);
            potentialRestaurant.LunchMenuProbability = scores.LunchMenuProbability;

            // tally the detections per match type in a single pass
            var matched = 0;
            var exact = 0;
            var partial = 0;
            var fuzzy = 0;

            foreach (var point in scores.Points)
            {
                var matchType = point.DetectionType;

                if (matchType != StringMatchType.NoMatch)
                {
                    matched++;
                }

                if (matchType == StringMatchType.Exact)
                {
                    exact++;
                }

                if (matchType == StringMatchType.Partial)
                {
                    partial++;
                }

                if (matchType == StringMatchType.Fuzzy)
                {
                    fuzzy++;
                }
            }

            potentialRestaurant.TotalKeywordDetections = matched;
            potentialRestaurant.ExactKeywordDetections = exact;
            potentialRestaurant.PartialKeywordDetections = partial;
            potentialRestaurant.FuzzyKeywordDetections = fuzzy;
        }
        /// <summary>
        /// Scores a single URL as a lunch restaurant.
        /// The URL is only processed when no restaurant is stored for its base URL yet, or when
        /// the stored one has the <c>CannotConnect</c> status. If the document cannot be fetched,
        /// the restaurant is stored with the <c>CannotConnect</c> status; otherwise the scores
        /// are calculated and logged, and the restaurant and its deep links are stored.
        /// Exceptions raised while scoring are logged instead of being propagated.
        /// </summary>
        /// <param name="url">Absolute URL of the page to score.</param>
        public void ScoreLunchRestaurant(string url)
        {
            var potentialRestaurant = new LunchRestaurant
            {
                URL = Utils.GetBaseUrl(url), // primary key
                AbsoluteURL = url,           // used for creating and parsing the model
                Status = (int)LunchRestaurantStatus.OK
            };

            try
            {
                // Check if we already have this one
                var existingMenu = LunchDA.Instance.FindExistingLunchRestaurant(potentialRestaurant.URL);
                if (existingMenu == null || existingMenu.Status == (int)LunchRestaurantStatus.CannotConnect)
                {
                    var lunchMenuDocument = Utils.GetLunchRestaurantDocumentForUrl(url, Settings.Default.HTTPTimeoutSeconds);
                    if (lunchMenuDocument == null)
                    {
                        // no special error handling for now, any HTTP error -> can't connect
                        potentialRestaurant.Status = (int)LunchRestaurantStatus.CannotConnect;
                        LogLunchMenuScores(url, LunchRestaurantStatus.CannotConnect, new LunchMenuScores());
                        LunchDA.Instance.UpdateLunchRestaurant(potentialRestaurant);
                        return;
                    }

                    // let's calculate and log scores
                    var scores = GetScoresForHtmlDocument(lunchMenuDocument);
                    LogLunchMenuScores(url, (LunchRestaurantStatus)potentialRestaurant.Status, scores);

                    // ..and let's finish the potential restaurant instance and update the DB
                    CompletePotentialLunchRestaurant(lunchMenuDocument, potentialRestaurant, scores);
                    LunchDA.Instance.UpdateLunchRestaurant(potentialRestaurant);
                    LunchDA.Instance.UpdateLunchRestaurantDeepLinks(potentialRestaurant.URL, scores.DeepLinks);
                }
            }
            catch (EntityException entityEx)
            {
                var errorMessage = entityEx.ParseInnerException();
                if (errorMessage.ToLowerInvariant().Contains("database is locked"))
                {
                    Logger.Fatal("SQLite database is locked.");
                    return;
                }

                // any other entity error used to be swallowed without a trace - log it the same
                // way as the generic failures below
                Logger.Fatal("Error scoring document for URL: {0} - {1}".With(url, errorMessage), entityEx);
            }
            catch (Exception ex)
            {
                Logger.Fatal("Error scoring document for URL: {0} - {1}".With(url, ex.Message), ex);
            }
        }
        /// <summary>
        /// Tries to parse a weekly lunch menu from the restaurant's page by walking through the
        /// detected features in document order: food items found after a weekday heading are
        /// assigned to that weekday.
        /// The returned menu has confidence 0 unless food items were collected for at least five
        /// weekdays, in which case the confidence is set to 1.
        /// </summary>
        /// <param name="restaurant">
        /// The restaurant to parse; its <c>AbsoluteURL</c> is fetched and its <c>URL</c> is used
        /// as the key of the returned menu.
        /// </param>
        /// <returns>
        /// The parsed lunch menu. A menu with confidence 0 is returned when the analysis is
        /// skipped, the document has more than 7000 nodes, a required weekday is missing or the
        /// weekdays appear out of order before five days have been collected.
        /// </returns>
        public LunchMenu ParseLunchMenu(LunchRestaurant restaurant)
        {
            var lunchMenu = new LunchMenu { RestaurantKey = restaurant.URL, Confidence = 0 };

            var doc = Utils.GetLunchRestaurantDocumentForUrl(restaurant.AbsoluteURL);
            if (ShouldSkipAnalysis(doc))
            {
                return lunchMenu;
            }

            var nodeCount = doc.HtmlDocument.DocumentNode.DescendantNodes().Count();

            // TODO: enforce an upper limit for points / nodes already on the Seeker side -
            // otherwise processing gets stuck in the strategies
            if (nodeCount > 7000)
            {
                return lunchMenu;
            }

            // 1. first, let's collect the basic features for the document
            var features = SimpleFeatureDetector.DetectFeatures(doc.HtmlDocument)
                                                .Where(f => f.Type != LunchMenuFeatureType.Unknown)
                                                .ToList();

            // SimpleFeatureDetector.PrintDetectedFeatures(features);

            // 2. let's make sure all required weekdays are found
            //    (the weekday texts are lower-cased once instead of once per required day)
            var weekdayTexts = features.Where(f => f.Type == LunchMenuFeatureType.Weekday)
                                       .Select(f => f.InnerText.ToLowerInvariant())
                                       .ToList();

            if (!RequiredWeekdays.All(day => weekdayTexts.Any(text => text.Contains(day))))
            {
                // we shouldn't continue with this strategy
                return lunchMenu;
            }

            // 3. let's find the index for the first weekday ("maanantai" = Monday)
            var firstWeekdayIndex = features.FindIndex(f => f.Type == LunchMenuFeatureType.Weekday &&
                                                            f.InnerText.ToLowerInvariant().Contains("maanantai"));
            if (firstWeekdayIndex == -1)
            {
                return lunchMenu;
            }

            // 4. let's loop through the detected features and collect food items for each weekday
            var currentWeekDay = WeekDay.Monday;
            for (var i = firstWeekdayIndex + 1; i < features.Count; i++)
            {
                var currentFeature = features[i];
                var nextFeature = (i + 1 < features.Count) ? features[i + 1] : null;

                if (currentFeature.Type == LunchMenuFeatureType.FoodItemAndPrice)
                {
                    AddFoodItemForWeekDay(lunchMenu, currentWeekDay, currentFeature);
                }
                else if (currentFeature.Type == LunchMenuFeatureType.FoodItem &&
                         nextFeature != null && nextFeature.Type == LunchMenuFeatureType.Price)
                {
                    // a food item directly followed by its price
                    AddFoodItemForWeekDay(lunchMenu, currentWeekDay, currentFeature, nextFeature);
                }
                else if (currentFeature.Type == LunchMenuFeatureType.Weekday)
                {
                    // we'll only accept detected weekday as the next weekday if weekdays are in correct order
                    var detectedWeekday = ParseWeekDay(currentFeature.InnerText);
                    var nextWeekday = (WeekDay)((int)currentWeekDay + 1);

                    if (detectedWeekday != nextWeekday &&
                        (lunchMenu.FoodItems == null || lunchMenu.FoodItems.Keys.Count < 5))
                    {
                        return lunchMenu;
                    }

                    currentWeekDay = detectedWeekday;
                }
            }

            // 5. finally, let's make sure we have at least some food items for each day.
            //    The weekday check above already allows FoodItems to be null, so it is guarded
            //    here as well instead of being dereferenced blindly.
            var foodItems = lunchMenu.FoodItems;
            if (foodItems == null || foodItems.Count == 0)
            {
                Logger.InfoFormat("{0} - no results.\n", restaurant.AbsoluteURL);
                return lunchMenu;
            }

            // NOTE(review): if FoodItems is keyed by weekday, grouping its entries by key yields
            // one entry per group, so this check (and the count logged below) may not reflect the
            // number of food items per day - confirm the intended semantics.
            var detectedDays = foodItems.Keys.Count;
            var foodItemsForEachWeekday = foodItems.GroupBy(f => f.Key).All(g => g.Any());

            if (detectedDays >= 5 && foodItemsForEachWeekday)
            {
                lunchMenu.Confidence = 1;
            }

            var result = new StringBuilder("results for {0}:\n".With(restaurant.AbsoluteURL));
            foreach (var weekDayItems in foodItems.GroupBy(f => f.Key).Where(g => g.Any()))
            {
                result.AppendFormat("-> {0} - {1} food items\n", weekDayItems.Key, weekDayItems.Count());
            }

            Logger.Info(result.ToString());

            return lunchMenu;
        }
 /// <summary>
 /// Factory method that builds a LunchRestaurant from the given initial property values.
 /// </summary>
 /// <param name="uRL">Value assigned to the URL property.</param>
 /// <param name="absoluteURL">Value assigned to the AbsoluteURL property.</param>
 /// <param name="status">Value assigned to the Status property.</param>
 /// <param name="modelStatus">Value assigned to the ModelStatus property.</param>
 /// <param name="totalPoints">Value assigned to the TotalPoints property.</param>
 /// <param name="lunchMenuProbability">Value assigned to the LunchMenuProbability property.</param>
 /// <param name="dateAdded">Value assigned to the DateAdded property.</param>
 /// <returns>A new LunchRestaurant instance with the listed properties set.</returns>
 public static LunchRestaurant CreateLunchRestaurant(global::System.String uRL, global::System.String absoluteURL, global::System.Int32 status, global::System.Int32 modelStatus, global::System.Int32 totalPoints, global::System.Decimal lunchMenuProbability, global::System.DateTime dateAdded)
 {
     return new LunchRestaurant
     {
         URL = uRL,
         AbsoluteURL = absoluteURL,
         Status = status,
         ModelStatus = modelStatus,
         TotalPoints = totalPoints,
         LunchMenuProbability = lunchMenuProbability,
         DateAdded = dateAdded
     };
 }
 /// <summary>
 /// Deprecated method for adding a new object to the LunchRestaurants EntitySet. Consider using the .Add method of the associated ObjectSet&lt;T&gt; property instead.
 /// </summary>
 /// <param name="lunchRestaurant">The restaurant object to add to the "LunchRestaurants" entity set.</param>
 public void AddToLunchRestaurants(LunchRestaurant lunchRestaurant)
 {
     // forwards to the base class, naming the target entity set explicitly
     base.AddObject("LunchRestaurants", lunchRestaurant);
 }