/*
 * Generic import process (design notes):
 *   Provider properties: import method, import URL/path, import frequency, IsMaster.
 *   Fetch latest data, then for each item: check if it exists or is a strong duplicate
 *   and get its ID; if new, add it; if it exists, prepare an update (setting live status
 *   where the provider supports it — open question: what if the item was updated manually
 *   on OCM?) and send the update. Log exceptions and the count of items added/modified.
 *   TODO: a way to remove (or at least log) items which no longer exist in the data source.
 */

/// <summary>
/// De-duplicates a list of imported POIs against the current master list, merging updates
/// into existing items where the data-provider reference matches, removing likely duplicates,
/// marking residual in-source duplicates as "Delisted Duplicate", and populating
/// <paramref name="report"/> with the Added/Updated/Duplicates/Delisted/LowDataQuality/Unchanged sets.
/// </summary>
/// <param name="cpList">Incoming imported POIs; mutated in place (duplicates removed, updates substituted).</param>
/// <param name="updateDuplicate">If true, exact provider-reference matches are merged into the existing POI and queued as updates.</param>
/// <param name="coreRefData">Reference data, used for submission status types and geolocation population.</param>
/// <param name="report">Import report to populate with the categorized results.</param>
/// <param name="allowDupeWithDifferentOperator">If true, near-identical neighbouring items are kept when their operators differ.</param>
/// <returns>The final processed list, sorted by position, ready to apply as inserts/updates.</returns>
public async Task<List<ChargePoint>> DeDuplicateList(List<ChargePoint> cpList, bool updateDuplicate, CoreReferenceData coreRefData, ImportReport report, bool allowDupeWithDifferentOperator = false)
{
    var stopWatch = new Stopwatch();
    stopWatch.Start();

    //get list of all current POIs (in relevant countries) including most delisted ones
    //(SubmissionStatusTypeID = 0 requests all statuses; only countries present in the import are fetched)
    int[] countryIds = (from poi in cpList
                        where poi.AddressInfo.Country != null
                        select poi.AddressInfo.Country.ID).Distinct().ToArray();

    APIRequestParams filters = new APIRequestParams { CountryIDs = countryIds, MaxResults = 1000000, EnableCaching = false, SubmissionStatusTypeID = 0 };

    //List<ChargePoint> masterList = await new OCMClient(IsSandboxedAPIMode).GetLocations(filters); //new OCMClient().FindSimilar(null, 10000); //fetch all charge points regardless of status
    var poiManager = new POIManager();

    //fetch all charge points regardless of status
    List<ChargePoint> masterListCollection = poiManager.GetChargePoints(filters);

    //index the master list on the two fields used for exact provider-reference matching
    var spec = new i4o.IndexSpecification<ChargePoint>()
        .Add(i => i.DataProviderID)
        .Add(i => i.DataProvidersReference)
        ;
    var masterList = new i4o.IndexSet<ChargePoint>(masterListCollection, spec);

    List<ChargePoint> masterListCopy = new List<ChargePoint>();
    foreach (var tmp in masterList)
    {
        //full copy of master list item (via JSON round-trip) so we have a before/after snapshot for diffing later
        masterListCopy.Add(JsonConvert.DeserializeObject<ChargePoint>(JsonConvert.SerializeObject(tmp)));
    }

    //if we failed to get a master list, quit with no result
    if (masterListCollection.Count == 0) return new List<ChargePoint>();

    List<ChargePoint> duplicateList = new List<ChargePoint>();
    List<ChargePoint> updateList = new List<ChargePoint>();

    ChargePoint previousCP = null;

    //for each item to be imported, deduplicate by adding to updateList only the items which we don't already have
    //(sorting by position lets the previous-item comparison below catch near-neighbour duplicates)
    var cpListSortedByPos = cpList.OrderBy(c => c.AddressInfo.Latitude).ThenBy(c => c.AddressInfo.Longitude);

    int poiProcessed = 0;
    int totalPOI = cpListSortedByPos.Count();

    Stopwatch dupeIdentWatch = new Stopwatch();
    dupeIdentWatch.Start();

    foreach (var item in cpListSortedByPos)
    {
        var itemGeoPos = new System.Device.Location.GeoCoordinate(item.AddressInfo.Latitude, item.AddressInfo.Longitude);

        //item is duplicate if we already seem to have it, based on: same data provider reference,
        //same title/address/postcode, or approximate position match within DUPLICATE_DISTANCE_METERS
        var dupeList = masterList.Where(c =>
                (c.DataProvider != null && c.DataProviderID == item.DataProviderID && c.DataProvidersReference == item.DataProvidersReference) ||
                (c.AddressInfo.Title == item.AddressInfo.Title && c.AddressInfo.AddressLine1 == item.AddressInfo.AddressLine1 && c.AddressInfo.Postcode == item.AddressInfo.Postcode) ||
                (GeoManager.IsClose(c.AddressInfo.Latitude, c.AddressInfo.Longitude, item.AddressInfo.Latitude, item.AddressInfo.Longitude) && new System.Device.Location.GeoCoordinate(c.AddressInfo.Latitude, c.AddressInfo.Longitude).GetDistanceTo(itemGeoPos) < DUPLICATE_DISTANCE_METERS) //meters distance apart
            );

        if (dupeList.Any())
        {
            if (updateDuplicate)
            {
                //if updating duplicates, get exact matching duplicate based on provider reference and update/merge with this item to update status/merge properties
                var updatedItem = dupeList.FirstOrDefault(d => d.DataProviderID == (item.DataProvider != null ? item.DataProvider.ID : item.DataProviderID) && d.DataProvidersReference == item.DataProvidersReference);

                if (updatedItem != null)
                {
                    //only merge/update from live published items (or provider-delisted ones, so a provider can re-list)
                    if (updatedItem.SubmissionStatus.IsLive == (bool?)true
                        || updatedItem.SubmissionStatus.ID == (int)StandardSubmissionStatusTypes.Delisted_RemovedByDataProvider
                        || updatedItem.SubmissionStatus.ID == (int)StandardSubmissionStatusTypes.Delisted_NotPublicInformation)
                    {
                        //item is an exact match from same data provider:
                        //overwrite existing with imported data (use import as master)
                        //updatedItem = poiManager.PreviewPopulatedPOIFromModel(updatedItem);
                        MergeItemChanges(item, updatedItem, false);

                        updateList.Add(updatedItem);
                    }
                }

                if (updatedItem == null)
                {
                    //duplicates are not an exact match — intentionally no action taken
                    //TODO: resolve whether imported data should change duplicate
                    //merge new properties from imported item
                    //if (item.StatusType != null) updatedItem.StatusType = item.StatusType;
                    //updateList.Add(updatedItem);
                }
            }

            //item has one or more likely duplicates, add it to list of items to remove
            duplicateList.Add(item);
        }

        //mark item as duplicate if location/title exactly matches previous entry or lat/long is within DuplicateDistance meters
        //(valid because the list is sorted by position, so near-duplicates are adjacent)
        if (previousCP != null)
        {
            //this branch is the most expensive part of dedupe:
            if (IsDuplicateLocation(item, previousCP, true))
            {
                if (!duplicateList.Contains(item))
                {
                    if (allowDupeWithDifferentOperator && item.OperatorID != previousCP.OperatorID)
                    {
                        Log("Duplicated allowed due to different operator:" + item.AddressInfo.Title);
                    }
                    else
                    {
                        Log("Duplicated item removed:" + item.AddressInfo.Title);
                        duplicateList.Add(item);
                    }
                }
            }
        }

        previousCP = item;

        //periodic progress output for long imports
        poiProcessed++;
        if (poiProcessed % 300 == 0)
        {
            System.Diagnostics.Debug.WriteLine("Deduplication: " + poiProcessed + " processed of " + totalPOI);
        }
    }

    dupeIdentWatch.Stop();
    Log("De-dupe pass took " + dupeIdentWatch.Elapsed.TotalSeconds + " seconds. " + (dupeIdentWatch.Elapsed.TotalMilliseconds / cpList.Count) + "ms per item.");

    //remove duplicates from list to apply
    foreach (var dupe in duplicateList)
    {
        cpList.Remove(dupe);
    }

    Log("Duplicates removed from import:" + duplicateList.Count);

    //add updated items (replace duplicates with property changes)
    foreach (var updatedItem in updateList)
    {
        if (!cpList.Contains(updatedItem))
        {
            cpList.Add(updatedItem);
        }
    }

    Log("Updated items to import:" + updateList.Count);

    //populate missing location info from geolocation cache if possible
    Stopwatch geoWatch = new Stopwatch();
    geoWatch.Start();
    PopulateLocationFromGeolocationCache(cpList, coreRefData);
    geoWatch.Stop();
    Log("Populate Country from Lat/Long took " + geoWatch.Elapsed.TotalSeconds + " seconds. " + (geoWatch.Elapsed.TotalMilliseconds / cpList.Count) + "ms per item.");

    //final pass to catch duplicates present in data source, mark additional items as Delisted Duplicate so we have a record for them
    var submissionStatusDelistedDupe = coreRefData.SubmissionStatusTypes.First(s => s.ID == 1001); //delisted duplicate
    previousCP = null;

    //sort current cp list by position again (cpList was mutated above; queries re-evaluate lazily)
    cpListSortedByPos = cpList.OrderBy(c => c.AddressInfo.Latitude).ThenBy(c => c.AddressInfo.Longitude);

    //mark any duplicates in final list as delisted duplicates (submitted to api)
    foreach (var cp in cpListSortedByPos)
    {
        bool isDuplicate = false;
        if (previousCP != null)
        {
            isDuplicate = IsDuplicateLocation(cp, previousCP, false);
            if (isDuplicate)
            {
                cp.SubmissionStatus = submissionStatusDelistedDupe;
                cp.SubmissionStatusTypeID = submissionStatusDelistedDupe.ID;

                //record the parent POI the item duplicates, when the parent already exists in OCM
                if (previousCP.ID > 0)
                {
                    if (cp.GeneralComments == null) cp.GeneralComments = "";
                    cp.GeneralComments += " [Duplicate of OCM-" + previousCP.ID + "]";
                    cp.ParentChargePointID = previousCP.ID;
                }
            }
        }

        //only advance the comparison anchor past non-duplicates, so runs of duplicates all compare to the same parent
        if (!isDuplicate)
        {
            previousCP = cp;
        }
    }

    //categorize results: ID == 0 means not yet in OCM (new), ID > 0 means an existing POI being updated
    report.Added = cpListSortedByPos.Where(cp => cp.ID == 0).ToList();
    report.Updated = cpListSortedByPos.Where(cp => cp.ID > 0).ToList();
    report.Duplicates = duplicateList; //TODO: add additional pass of duplicates from above

    //determine which POIs in our master list are no longer referenced in the import
    //(only same-provider, currently live/under-review items with no user comments or media are eligible)
    report.Delisted = masterList.Where(cp => cp.DataProviderID == report.ProviderDetails.DataProviderID
        && cp.SubmissionStatus != null && (cp.SubmissionStatus.IsLive == true || cp.SubmissionStatusTypeID == (int)StandardSubmissionStatusTypes.Imported_UnderReview)
        && !cpListSortedByPos.Any(master => master.ID == cp.ID) && !report.Duplicates.Any(master => master.ID == cp.ID)
        && cp.UserComments == null && cp.MediaItems == null).ToList();

    //safety check to ensure we're not delisting items just because we have incomplete import data:
    if (cpList.Count < 50)// || (report.Delisted.Count > cpList.Count))
    {
        report.Delisted = new List<ChargePoint>();
    }

    //determine list of low quality POIs (incomplete address info etc)
    report.LowDataQuality = new List<ChargePoint>();
    report.LowDataQuality.AddRange(GetLowDataQualityPOIs(report.Added));
    report.LowDataQuality.AddRange(GetLowDataQualityPOIs(report.Updated));

    Log("Removing " + report.LowDataQuality.Count + " low quality POIs from added/updated");

    //remove references in added/updated to any low quality POIs
    foreach (var p in report.LowDataQuality)
    {
        report.Added.Remove(p);
    }

    foreach (var p in report.LowDataQuality)
    {
        report.Updated.Remove(p);
    }

    //remove updates which only change datelaststatusupdate (or other ignorable fields)
    var updatesToIgnore = new List<ChargePoint>();
    foreach (var poi in report.Updated)
    {
        //compare the pre-import snapshot with the fully populated updated POI
        var origPOI = masterListCopy.FirstOrDefault(p => p.ID == poi.ID);
        var updatedPOI = poiManager.PreviewPopulatedPOIFromModel(poi);
        var differences = poiManager.CheckDifferences(origPOI, updatedPOI);

        //ignore differences in fields which change on every import or which imports must not affect
        differences.RemoveAll(d => d.Context == ".MetadataValues");
        differences.RemoveAll(d => d.Context == ".DateLastStatusUpdate");
        differences.RemoveAll(d => d.Context == ".UUID");
        differences.RemoveAll(d => d.Context == ".DataProvider.DateLastImported");
        differences.RemoveAll(d => d.Context == ".IsRecentlyVerified");
        differences.RemoveAll(d => d.Context == ".DateLastVerified");
        differences.RemoveAll(d => d.Context == ".UserComments");
        differences.RemoveAll(d => d.Context == ".MediaItems");

        if (!differences.Any())
        {
            updatesToIgnore.Add(poi);
        }
        else
        {
            //differences exist: double-check with a deep object comparison, filtering the same ignorable fields
            CompareLogic compareLogic = new CompareLogic();
            compareLogic.Config.MaxDifferences = 100;
            compareLogic.Config.IgnoreObjectTypes = false;
            compareLogic.Config.IgnoreUnknownObjectTypes = true;
            compareLogic.Config.CompareChildren = true;

            ComparisonResult result = compareLogic.Compare(origPOI, updatedPOI);
            var diffReport = new KellermanSoftware.CompareNetObjects.Reports.UserFriendlyReport();
            result.Differences.RemoveAll(d => d.PropertyName == ".MetadataValues");
            result.Differences.RemoveAll(d => d.PropertyName == ".DateLastStatusUpdate");
            result.Differences.RemoveAll(d => d.PropertyName == ".UUID");
            result.Differences.RemoveAll(d => d.PropertyName == ".DataProvider.DateLastImported");
            result.Differences.RemoveAll(d => d.PropertyName == ".IsRecentlyVerified");
            result.Differences.RemoveAll(d => d.PropertyName == ".DateLastVerified");
            result.Differences.RemoveAll(d => d.PropertyName == ".UserComments");
            result.Differences.RemoveAll(d => d.PropertyName == ".MediaItems");

            System.Diagnostics.Debug.WriteLine("Difference:" + diffReport.OutputString(result.Differences));

            if (!result.Differences.Any())
            {
                updatesToIgnore.Add(poi);
            }
        }
    }

    //move no-op updates out of Updated and into Unchanged
    foreach (var p in updatesToIgnore)
    {
        if (report.Unchanged == null) report.Unchanged = new List<ChargePoint>();
        report.Unchanged.Add(p);
        report.Updated.Remove(p);
    }

    //TODO: if POI is a duplicate ensure imported data provider reference/URL is included as reference metadata in OCM's version of the POI

    stopWatch.Stop();
    Log("Deduplicate List took " + stopWatch.Elapsed.TotalSeconds + " seconds");

    //return final processed list ready for applying as insert/updates
    return cpListSortedByPos.ToList();
}
/// <summary>
/// Converts an edit queue entry to its model form and computes the differences between the
/// "before" POI (the serialized previous data, or the currently stored POI when
/// <paramref name="loadCurrentItem"/> is true) and the proposed edit.
/// </summary>
/// <param name="item">The edit queue entry (data layer) to convert and diff.</param>
/// <param name="cpManager">POI manager used to load the current item and compute differences.</param>
/// <param name="loadCurrentItem">If true, reload the "before" POI from storage by ID instead of using the serialized previous data.</param>
/// <returns>The edit queue item model with its <c>Differences</c> populated.</returns>
public Model.EditQueueItem GetItemWithDifferences(Core.Data.EditQueueItem item, POIManager cpManager, bool loadCurrentItem)
{
    var queueItem = Model.Extensions.EditQueueItem.FromDataModel(item);

    //get diff between previous and edit
    Model.ChargePoint poiA = DeserializePOIFromJSON(queueItem.PreviousData);
    if (loadCurrentItem && poiA != null)
    {
        //reuse the supplied POIManager rather than constructing a second instance
        poiA = cpManager.Get(poiA.ID);
    }
    Model.ChargePoint poiB = DeserializePOIFromJSON(queueItem.EditData);
    queueItem.Differences = cpManager.CheckDifferences(poiA, poiB, useObjectCompare: true);
    return queueItem;
}