/// <summary>
/// Builds a comparison hash for a datastream company from its name, mapped country,
/// website, first office address, SME status, supply-chain categories and roles.
/// </summary>
/// <remarks>
/// The parts are concatenated, trimmed, lower-cased (invariant culture) and stripped of
/// spaces before <see cref="string.GetHashCode()"/> is taken. That hash is only meaningful
/// inside the current process; it must not be persisted or compared across runs.
/// </remarks>
/// <param name="company">The datastream company to fingerprint.</param>
/// <returns>The hash code of the normalised, concatenated field values.</returns>
private int StreamGenereateHashValue(DataStreamCompany company)
{
    var builder = new System.Text.StringBuilder();

    builder.Append(company.company_name);
    builder.Append(MatchCountries(company.country));

    // A missing website contributes a single space, which the normalisation below removes.
    builder.Append(company.website ?? " ");

    // Only the first office's address takes part in the hash.
    if (company.offices != null && company.offices.Any())
    {
        builder.Append(company.offices[0].address ?? "");
    }

    builder.Append(company.sme_status.ToString());

    // A missing category list contributes nothing instead of throwing.
    if (company.supply_chain_categories != null)
    {
        foreach (var category in company.supply_chain_categories)
        {
            if (category == null)
            {
                continue;
            }

            // Categories are expected to look like "prefix.name"; the segment after the
            // first dot is hashed. NOTE(review): an entry without a dot used to throw
            // IndexOutOfRangeException; it now falls back to the whole value - confirm
            // this agrees with the API-side hash.
            var segments = category.Split('.');
            builder.Append(segments.Length > 1 ? segments[1] : category);
        }
    }

    // A missing role list contributes nothing instead of throwing.
    if (company.supply_chain_roles != null)
    {
        foreach (var role in company.supply_chain_roles)
        {
            builder.Append(role);
        }
    }

    var normalised = builder.ToString()
        .Trim()
        .ToLowerInvariant()
        .Replace(" ", "");

    return normalised.GetHashCode();
}
/// <summary>
/// Records a datastream company together with the entry it appears to duplicate.
/// </summary>
/// <param name="duplicate">The datastream company flagged as a potential duplicate.</param>
/// <param name="match">The datastream company it matched, or <c>null</c> when the match is an API company.</param>
/// <param name="match2">The API company it matched, or <c>null</c> when the match is a datastream company.</param>
public PotentialDiplicate(
    DataStreamCompany duplicate,
    DataStreamCompany match,
    APICompany match2)
{
    Duplicate = duplicate;
    Match = match;
    Match2 = match2;
}
/// <summary>
/// Filters the datastream companies down to those that are neither duplicated within the
/// datastream itself nor already present in the API data.
/// </summary>
/// <remarks>
/// Pass 1 keeps only companies whose status is "Approved" or "Partner Added" and compares
/// each one with every other datastream company. If another company in the same mapped
/// country has a name that contains this company's name, the company is recorded as a
/// potential duplicate instead of being kept.
/// Pass 2 compares the survivors with the API companies. API companies that already carry
/// a directory entry reference number are not considered for matching. An exact hash match
/// stamps the API company with the datastream reference number, records it as an exact
/// match and drops the datastream company from the result. Otherwise, if the datastream
/// name contains the name of an API company in the same mapped country, the company is
/// recorded as a potential duplicate.
/// Names are compared trimmed and lower-cased (invariant culture). An empty name is
/// contained in every string, so a blank name matches the first same-country entry.
/// </remarks>
/// <param name="dataStreamCompanies">All companies read from the datastream.</param>
/// <param name="apiCompanies">All companies currently known to the API.</param>
/// <returns>The datastream companies that had no exact or potential match.</returns>
private List<DataStreamCompany> FindPotentialDuplicates(
    List<DataStreamCompany> dataStreamCompanies,
    List<APICompany> apiCompanies)
{
    var reducedDatastreamCompanies = new List<DataStreamCompany>();
    var filteredDataStreamCompanies = new List<DataStreamCompany>();

    // Pass 1: remove potential duplicates inside the datastream itself.
    foreach (var item in dataStreamCompanies)
    {
        // Only these two statuses are eligible; everything else is skipped.
        if (!item.status.Equals("Approved") && !item.status.Equals("Partner Added"))
        {
            continue;
        }

        // Loop-invariant values for the inner comparison.
        var itemCountry = MatchCountries(item.country);
        var itemName = item.company_name.Trim().ToLowerInvariant();

        DataStreamCompany streamMatch = null;
        foreach (var other in dataStreamCompanies)
        {
            // Never compare an entry with itself.
            if (item.entry_reference_number.Equals(other.entry_reference_number))
            {
                continue;
            }

            if (!MatchCountries(other.country).Equals(itemCountry))
            {
                continue;
            }

            // Same country and the other name contains this one: potential duplicate.
            var otherName = other.company_name.Trim().ToLowerInvariant();
            if (otherName.Contains(itemName))
            {
                streamMatch = other;
                break;
            }
        }

        if (streamMatch == null)
        {
            reducedDatastreamCompanies.Add(item);
        }
        else
        {
            this._datastreamPotentialDuplicates.Add(new PotentialDiplicate(item, streamMatch, null));
        }
    }

    // Pass 2: compare the reduced list with the companies from the API.
    foreach (var candidate in reducedDatastreamCompanies)
    {
        var candidateHash = StreamGenereateHashValue(candidate);
        var candidateCountry = MatchCountries(candidate.country);
        var candidateName = candidate.company_name.Trim().ToLowerInvariant();

        var exactMatch = false;
        APICompany apiMatch = null;

        foreach (var apiCompany in apiCompanies)
        {
            // API companies that already carry a reference number are not matched against.
            var reference = apiCompany.CompanyDirectoryEntryReffNumber;
            if (reference != null && !reference.Equals(""))
            {
                continue;
            }

            // Exact match: link the API company to this datastream entry and stop.
            if (candidateHash == APIGenereateHashValue(apiCompany))
            {
                apiCompany.CompanyDirectoryEntryReffNumber = candidate.entry_reference_number;
                this._exactHashMatch.Add(apiCompany);
                exactMatch = true;
                break;
            }

            if (!candidateCountry.Equals(MatchCountries(apiCompany.Country)))
            {
                continue;
            }

            // Same country and the datastream name contains the API name: potential duplicate.
            var apiName = apiCompany.CompanyName.Trim().ToLowerInvariant();
            if (candidateName.Contains(apiName))
            {
                apiMatch = apiCompany;
                break;
            }
        }

        // An exact hash match is neither returned nor reported as a potential duplicate.
        if (exactMatch)
        {
            continue;
        }

        if (apiMatch == null)
        {
            filteredDataStreamCompanies.Add(candidate);
        }
        else
        {
            this._datastreamPotentialDuplicates.Add(new PotentialDiplicate(candidate, null, apiMatch));
        }
    }

    return filteredDataStreamCompanies;
}