/// <summary>
/// Runs the lunch menu analysis for a restaurant on a background task and waits
/// for it to finish, giving up after a fixed timeout.
/// </summary>
/// <remarks>
/// Parsing strategies are tried in descending priority order until one of them
/// produces a result with confidence above 0.90; the last produced result (or
/// <c>null</c> when there are no strategies) is then passed to
/// <c>CompleteLunchRestaurantAnalysis</c>. When the timeout elapses an error is
/// logged and the method returns; the background task itself is not interrupted.
/// </remarks>
/// <param name="restaurant">The restaurant whose lunch menu should be analyzed.</param>
private void AnalyzeLunchRestaurantWithTimeout(LunchRestaurant restaurant)
{
    // let's analyze based on parsing strategies' priority
    var strategies = _parsingStrategies.OrderByDescending(st => st.Priority);

    // we'll stop waiting after the timeout in case parsing goes wrong
    // TODO: timeout settings in DB
    const int timeoutForProcessing = 240000;

    var analysis = Task.Factory.StartNew(() =>
    {
        LunchMenu parseResult = null;
        foreach (var strategy in strategies)
        {
            parseResult = strategy.ParseLunchMenu(restaurant);

            // TODO: confidence settings in DB
            if (parseResult.Confidence > 0.90M)
            {
                break;
            }
        }

        CompleteLunchRestaurantAnalysis(parseResult);
    });

    // A timed wait replaces the former "sleep then cancel" helper thread, which
    // stayed alive for the whole timeout on every call even when the analysis
    // finished immediately. Wait returns false when the timeout elapsed first.
    if (!analysis.Wait(timeoutForProcessing))
    {
        Logger.ErrorFormat("Analysis timed out for URL: {0}", restaurant.AbsoluteURL);
    }

    Console.WriteLine("---------------------------------------------------------------------------------\n");
}
/// <summary>
/// Stores a lunch restaurant: when a restaurant with the same URL is found, its
/// status, hash and scoring fields are overwritten and its update date is set;
/// otherwise the given instance is added to the LunchRestaurants set.
/// </summary>
/// <param name="lunchRestaurant">The restaurant carrying the values to store.</param>
public void UpdateLunchRestaurant(LunchRestaurant lunchRestaurant)
{
    try
    {
        using (var context = new LunchEntities())
        {
            // NOTE(review): the lookup is not given this context - confirm that
            // edits made to the returned entity are persisted by SaveChanges below.
            var stored = FindExistingLunchRestaurant(lunchRestaurant.URL);

            if (stored == null)
            {
                // unknown URL -> add as a new restaurant
                lunchRestaurant.DateAdded = DateTime.UtcNow;
                context.LunchRestaurants.AddObject(lunchRestaurant);
            }
            else
            {
                // known URL -> refresh the stored values
                stored.Status = lunchRestaurant.Status;
                stored.SiteHash = lunchRestaurant.SiteHash;
                stored.LunchMenuProbability = lunchRestaurant.LunchMenuProbability;
                stored.TotalPoints = lunchRestaurant.TotalPoints;
                stored.TotalKeywordDetections = lunchRestaurant.TotalKeywordDetections;
                stored.ExactKeywordDetections = lunchRestaurant.ExactKeywordDetections;
                stored.PartialKeywordDetections = lunchRestaurant.PartialKeywordDetections;
                stored.FuzzyKeywordDetections = lunchRestaurant.FuzzyKeywordDetections;
                stored.DateUpdated = DateTime.UtcNow;
            }

            context.SaveChanges();
        }
    }
    catch (UpdateException ex)
    {
        // An update failure whose root-cause message contains "not unique" is
        // ignored; anything else is rethrown to the caller.
        var rootCause = ex.GetBaseException();
        if (!rootCause.Message.Contains("not unique"))
        {
            throw;
        }
    }
}
/// <summary>
/// Fills in the scoring-related fields of a potential lunch restaurant from the
/// scored document: site hash, total points, lunch menu probability and the
/// keyword detection counts per match type.
/// </summary>
/// <param name="lunchMenuDocument">The document whose hash is stored on the restaurant.</param>
/// <param name="potentialRestaurant">The restaurant instance to complete.</param>
/// <param name="scores">The scores calculated for the document.</param>
private static void CompletePotentialLunchRestaurant(LunchRestaurantDocument lunchMenuDocument, LunchRestaurant potentialRestaurant, LunchMenuScores scores)
{
    potentialRestaurant.SiteHash = lunchMenuDocument.Hash;

    var points = scores.Points;

    potentialRestaurant.TotalPoints = points.Sum(p => p.PointsGiven);
    potentialRestaurant.LunchMenuProbability = scores.LunchMenuProbability;

    // every detection that matched in some way
    potentialRestaurant.TotalKeywordDetections = points
        .Where(p => p.DetectionType != StringMatchType.NoMatch)
        .Count();

    // detections broken down by match type
    potentialRestaurant.ExactKeywordDetections = points
        .Where(p => p.DetectionType == StringMatchType.Exact)
        .Count();
    potentialRestaurant.PartialKeywordDetections = points
        .Where(p => p.DetectionType == StringMatchType.Partial)
        .Count();
    potentialRestaurant.FuzzyKeywordDetections = points
        .Where(p => p.DetectionType == StringMatchType.Fuzzy)
        .Count();
}
/// <summary>
/// Scores a single URL as a lunch restaurant and stores the outcome.
/// </summary>
/// <remarks>
/// Nothing is done when the restaurant is already known, unless its stored
/// status is <c>CannotConnect</c>. When the document cannot be fetched the
/// restaurant is stored with status <c>CannotConnect</c>. Exceptions are logged
/// and not propagated to the caller.
/// </remarks>
/// <param name="url">The absolute URL to score.</param>
public void ScoreLunchRestaurant(string url)
{
    var potentialRestaurant = new LunchRestaurant
    {
        URL = Utils.GetBaseUrl(url), // primary key
        AbsoluteURL = url, // used for creating and parsing the model
        Status = (int)LunchRestaurantStatus.OK
    };

    try
    {
        // Check if we already have this one; only unknown restaurants and ones
        // we previously couldn't connect to are (re)scored
        var existingMenu = LunchDA.Instance.FindExistingLunchRestaurant(potentialRestaurant.URL);
        if (existingMenu != null && existingMenu.Status != (int)LunchRestaurantStatus.CannotConnect)
        {
            return;
        }

        var lunchMenuDocument = Utils.GetLunchRestaurantDocumentForUrl(url, Settings.Default.HTTPTimeoutSeconds);
        if (lunchMenuDocument == null)
        {
            // no special error handling for now, any HTTP error -> can't connect
            potentialRestaurant.Status = (int)LunchRestaurantStatus.CannotConnect;
            LogLunchMenuScores(url, LunchRestaurantStatus.CannotConnect, new LunchMenuScores());
            LunchDA.Instance.UpdateLunchRestaurant(potentialRestaurant);
            return;
        }

        // let's calculate and log scores
        var scores = GetScoresForHtmlDocument(lunchMenuDocument);
        LogLunchMenuScores(url, (LunchRestaurantStatus)potentialRestaurant.Status, scores);

        // ..and let's finish the potential restaurant instance and update the DB
        CompletePotentialLunchRestaurant(lunchMenuDocument, potentialRestaurant, scores);
        LunchDA.Instance.UpdateLunchRestaurant(potentialRestaurant);
        LunchDA.Instance.UpdateLunchRestaurantDeepLinks(potentialRestaurant.URL, scores.DeepLinks);
    }
    catch (EntityException entityEx)
    {
        var errorMessage = entityEx.ParseInnerException();
        if (errorMessage.ToLowerInvariant().Contains("database is locked"))
        {
            Logger.Fatal("SQLite database is locked.");
            return;
        }

        // Any other entity error used to be swallowed without a trace; log it
        // the same way as the generic handler below so failures stay visible.
        Logger.Fatal("Error scoring document for URL: {0} - {1}".With(url, errorMessage), entityEx);
    }
    catch (Exception ex)
    {
        Logger.Fatal("Error scoring document for URL: {0} - {1}".With(url, ex.Message), ex);
    }
}
/// <summary>
/// Tries to parse a weekly lunch menu for the restaurant by detecting weekday,
/// food item and price features in its HTML document and collecting the food
/// items that follow each weekday.
/// </summary>
/// <remarks>
/// The returned menu's Confidence is initialized to 0 and is raised to 1 by this
/// method only when food items were collected for at least five weekdays. The
/// strategy gives up early (returning the menu as collected so far) when the
/// document should be skipped, has more than 7000 nodes, lacks any of the
/// required weekdays, or lists weekdays out of order.
/// </remarks>
/// <param name="restaurant">The restaurant whose AbsoluteURL is fetched and parsed.</param>
/// <returns>The lunch menu with whatever food items were collected.</returns>
public LunchMenu ParseLunchMenu(LunchRestaurant restaurant)
{
    var lunchMenu = new LunchMenu
    {
        RestaurantKey = restaurant.URL,
        Confidence = 0
    };

    var doc = Utils.GetLunchRestaurantDocumentForUrl(restaurant.AbsoluteURL);
    // NOTE(review): presumably ShouldSkipAnalysis also covers a null document - confirm
    if (ShouldSkipAnalysis(doc))
    {
        return lunchMenu;
    }

    var nodeCount = doc.HtmlDocument.DocumentNode.DescendantNodes().Count();

    // TODO: enforce an upper limit for points / nodes already on the Seeker side - otherwise the strategies get stuck
    if (nodeCount > 7000)
    {
        return lunchMenu;
    }

    // 1. first, let's collect and print the basic features for the document
    var features = SimpleFeatureDetector.DetectFeatures(doc.HtmlDocument)
                                        .Where(f => f.Type != LunchMenuFeatureType.Unknown)
                                        .ToList();
    // SimpleFeatureDetector.PrintDetectedFeatures(features);

    // 2. let's make sure all 5 weekdays are found
    if (!RequiredWeekdays.All(day => features.Any(f => f.Type == LunchMenuFeatureType.Weekday && f.InnerText.ToLowerInvariant().Contains(day))))
    {
        // we shouldn't continue with this strategy
        return lunchMenu;
    }

    // 3. let's find the index for the first weekday ("maanantai" = Monday in Finnish)
    var firstWeekdayIndex = features.FindIndex(f => f.Type == LunchMenuFeatureType.Weekday && f.InnerText.ToLowerInvariant().Contains("maanantai"));
    if (firstWeekdayIndex == -1)
    {
        return lunchMenu;
    }

    // 4. let's loop through the detected features and collect food items for each weekday
    var currentWeekDay = WeekDay.Monday;
    for (var i = (firstWeekdayIndex + 1); i < features.Count; i++)
    {
        var currentFeature = features[i];

        // look ahead one feature so a food item can be paired with a following price
        LunchMenuFeature nextFeature = null;
        if (features.Count != (i + 1))
        {
            nextFeature = features[i + 1];
        }

        if (currentFeature.Type == LunchMenuFeatureType.FoodItemAndPrice)
        {
            AddFoodItemForWeekDay(lunchMenu, currentWeekDay, currentFeature);
        }
        else if (currentFeature.Type == LunchMenuFeatureType.FoodItem && nextFeature != null && nextFeature.Type == LunchMenuFeatureType.Price)
        {
            AddFoodItemForWeekDay(lunchMenu, currentWeekDay, currentFeature, nextFeature);
        }
        else if (currentFeature.Type == LunchMenuFeatureType.Weekday)
        {
            // we'll only accept detected weekday as the next weekday if weekdays are in correct order
            var detectedWeekday = ParseWeekDay(currentFeature.InnerText);
            var nextWeekday = (WeekDay)Enum.ToObject(typeof(WeekDay), (int)currentWeekDay + 1);
            if (detectedWeekday != nextWeekday && (lunchMenu.FoodItems == null || lunchMenu.FoodItems.Keys.Count < 5))
            {
                return lunchMenu;
            }

            currentWeekDay = detectedWeekday;
        }
    }

    // The loop above treats FoodItems as possibly null (nothing collected yet);
    // apply the same guard here so a page with weekdays but no food items is
    // reported as "no results" instead of failing with a NullReferenceException.
    if (lunchMenu.FoodItems == null)
    {
        Logger.InfoFormat("{0} - no results.\n", restaurant.AbsoluteURL);
        return lunchMenu;
    }

    // 5. finally, let's make sure we have at least some food items for each day
    var detectedDays = lunchMenu.FoodItems.Keys.Count;
    // NOTE(review): GroupBy never yields empty groups, so this check always passes - confirm intent
    var foodItemsForEachWeekday = lunchMenu.FoodItems.GroupBy(f => f.Key).All(g => g.Any());
    if (detectedDays >= 5 && foodItemsForEachWeekday)
    {
        lunchMenu.Confidence = 1;
    }

    if (lunchMenu.FoodItems.Count > 0)
    {
        var result = new StringBuilder("results for {0}:\n".With(restaurant.AbsoluteURL));
        foreach (var weekDayItems in lunchMenu.FoodItems.GroupBy(f => f.Key).Where(g => g.Any()))
        {
            result.AppendFormat("-> {0} - {1} food items\n", weekDayItems.Key, weekDayItems.Count());
        }

        Logger.Info(result.ToString());
    }
    else
    {
        Logger.InfoFormat("{0} - no results.\n", restaurant.AbsoluteURL);
    }

    return lunchMenu;
}
/// <summary>
/// Factory method that builds a LunchRestaurant with its required properties set.
/// </summary>
/// <param name="uRL">Value assigned to the URL property.</param>
/// <param name="absoluteURL">Value assigned to the AbsoluteURL property.</param>
/// <param name="status">Value assigned to the Status property.</param>
/// <param name="modelStatus">Value assigned to the ModelStatus property.</param>
/// <param name="totalPoints">Value assigned to the TotalPoints property.</param>
/// <param name="lunchMenuProbability">Value assigned to the LunchMenuProbability property.</param>
/// <param name="dateAdded">Value assigned to the DateAdded property.</param>
/// <returns>The newly created LunchRestaurant instance.</returns>
public static LunchRestaurant CreateLunchRestaurant(global::System.String uRL, global::System.String absoluteURL, global::System.Int32 status, global::System.Int32 modelStatus, global::System.Int32 totalPoints, global::System.Decimal lunchMenuProbability, global::System.DateTime dateAdded)
{
    // properties are assigned in the same order as the parameters are declared
    return new LunchRestaurant
    {
        URL = uRL,
        AbsoluteURL = absoluteURL,
        Status = status,
        ModelStatus = modelStatus,
        TotalPoints = totalPoints,
        LunchMenuProbability = lunchMenuProbability,
        DateAdded = dateAdded
    };
}
/// <summary>
/// Deprecated way of adding a new object to the LunchRestaurants EntitySet.
/// Prefer the .Add method of the associated ObjectSet&lt;T&gt; property.
/// </summary>
/// <param name="lunchRestaurant">The entity to add to the set.</param>
public void AddToLunchRestaurants(LunchRestaurant lunchRestaurant)
{
    // name of the entity set the object is added to
    const string entitySetName = "LunchRestaurants";

    base.AddObject(entitySetName, lunchRestaurant);
}