Esempio n. 1
0
        /// <summary>
        /// Computes a hash over a normalised fingerprint of a datastream company.
        /// The fingerprint is the concatenation of: company name, matched country, website,
        /// address of the first office (if any), SME status, the second dot-separated segment
        /// of every supply-chain category, and every supply-chain role. The fingerprint is
        /// trimmed, lower-cased (invariant culture) and stripped of spaces before hashing.
        /// </summary>
        /// <param name="company">The datastream company to fingerprint.</param>
        /// <returns>The <see cref="string.GetHashCode()"/> of the normalised fingerprint.</returns>
        /// <exception cref="System.IndexOutOfRangeException">
        /// Thrown when a supply-chain category contains no "." separator.
        /// </exception>
        /// <remarks>
        /// String hash codes may differ between processes on modern .NET runtimes, so the
        /// returned value should only be compared with hashes computed in the same process.
        /// </remarks>
        private int StreamGenereateHashValue(DataStreamCompany company)
        {
            var companyName = company.company_name;
            var country     = MatchCountries(company.country);
            var website     = company.website ?? " ";

            // Only the first office contributes to the fingerprint.
            var address = "";
            if (company.offices != null && company.offices.Any())
            {
                address = company.offices[0].address ?? "";
            }

            var sme = company.sme_status.ToString();

            // A missing collection is treated like an empty one (mirrors the offices guard above)
            // instead of crashing with a NullReferenceException.
            // Categories are expected to look like "<prefix>.<name>"; only the part after the
            // first dot is used.
            var categories = company.supply_chain_categories == null
                ? ""
                : string.Concat(company.supply_chain_categories.Select(category => category.Split(".")[1]));

            var roles = company.supply_chain_roles == null
                ? ""
                : string.Concat(company.supply_chain_roles);

            var fingerprint = companyName + country + website + address + sme + categories + roles;

            // Normalise so that casing and spacing differences do not affect the hash.
            var normalised = fingerprint.Trim().ToLowerInvariant().Replace(" ", "");

            return normalised.GetHashCode();
        }
 /// <summary>
 /// Creates a record pairing a datastream company with the entry it potentially duplicates.
 /// </summary>
 /// <param name="duplicate">The datastream company suspected of being a duplicate.</param>
 /// <param name="match">The matching datastream company, or null when the match is an API company.</param>
 /// <param name="match2">The matching API company, or null when the match is a datastream company.</param>
 public PotentialDiplicate(DataStreamCompany duplicate, DataStreamCompany match, APICompany match2)
 {
     (Duplicate, Match, Match2) = (duplicate, match, match2);
 }
Esempio n. 3
0
        /// <summary>
        /// Filters the datastream companies down to those that are neither potential duplicates
        /// of another datastream company nor already represented among the API companies.
        /// <para>
        /// Pass 1 (datastream vs. itself): only companies whose status is "Approved" or
        /// "Partner Added" are considered. Such a company is recorded as a potential duplicate
        /// when another datastream company (different entry reference number) has the same
        /// matched country and a name that contains this company's name; otherwise it moves on
        /// to pass 2.
        /// </para>
        /// <para>
        /// Pass 2 (datastream vs. API): API companies that already carry a directory entry
        /// reference number are never used for matching (per the original author they are known
        /// to originate from the datastream and are kept for updating). For the remaining API
        /// companies, an identical hash is an exact match: the API company receives the
        /// datastream reference number, is added to <c>_exactHashMatch</c>, and the datastream
        /// company is dropped from the result. Failing that, a datastream company whose name
        /// contains the API company's name within the same matched country is recorded as a
        /// potential duplicate.
        /// </para>
        /// </summary>
        /// <param name="dataStreamCompanies">All companies read from the datastream.</param>
        /// <param name="apiCompanies">
        /// Companies read from the API. Entries that are exact hash matches are mutated:
        /// their <c>CompanyDirectoryEntryReffNumber</c> is set.
        /// </param>
        /// <returns>The datastream companies for which no exact or potential match was found.</returns>
        private List <DataStreamCompany> FindPotentialDuplicates(
            List <DataStreamCompany> dataStreamCompanies,
            List <APICompany> apiCompanies)
        {
            // Case-insensitive, whitespace-trimmed "container contains contained" test shared by both passes.
            // NOTE(review): an empty/whitespace-only contained name matches every container name
            // (string.Contains("") is true) - confirm whether that is desired.
            bool NameContains(string containerName, string containedName)
            {
                return containerName.Trim().ToLowerInvariant()
                       .Contains(containedName.Trim().ToLowerInvariant());
            }

            var reducedDatastreamCompanies  = new List <DataStreamCompany>();
            var filteredDataStreamCompanies = new List <DataStreamCompany>();

            // Pass 1: compare the datastream with itself.
            foreach (var item in dataStreamCompanies)
            {
                // Only these two statuses are processed; everything else is ignored.
                if (!item.status.Equals("Approved") && !item.status.Equals("Partner Added"))
                {
                    continue;
                }

                DataStreamCompany streamMatch = null;
                foreach (var other in dataStreamCompanies)
                {
                    // Never compare a company with itself.
                    if (item.entry_reference_number.Equals(other.entry_reference_number))
                    {
                        continue;
                    }

                    // Same country and the other name contains this name => potential duplicate.
                    if (MatchCountries(other.country).Equals(MatchCountries(item.country)) &&
                        NameContains(other.company_name, item.company_name))
                    {
                        streamMatch = other;
                        break;
                    }
                }

                if (streamMatch == null)
                {
                    reducedDatastreamCompanies.Add(item);
                }
                else
                {
                    this._datastreamPotentialDuplicates.Add(new PotentialDiplicate(item, streamMatch, null));
                }
            }

            // Pass 2: compare the survivors with the API companies.
            foreach (var candidate in reducedDatastreamCompanies)
            {
                var        candidateHash = StreamGenereateHashValue(candidate);
                var        exactMatch    = false;
                APICompany apiMatch      = null;

                foreach (var apiCompany in apiCompanies)
                {
                    // API companies that already have a reference number are not matching candidates.
                    // This also excludes API companies that received a reference number from an
                    // exact match earlier in this pass.
                    // NOTE(review): a name match on an earlier API company still wins over a later
                    // API company carrying this candidate's reference number - confirm intended.
                    if (apiCompany.CompanyDirectoryEntryReffNumber != null &&
                        !apiCompany.CompanyDirectoryEntryReffNumber.Equals(""))
                    {
                        continue;
                    }

                    // Exact match: link the API company to the datastream entry and stop looking.
                    if (candidateHash == APIGenereateHashValue(apiCompany))
                    {
                        apiCompany.CompanyDirectoryEntryReffNumber = candidate.entry_reference_number;
                        this._exactHashMatch.Add(apiCompany);
                        exactMatch = true;
                        break;
                    }

                    // Same country and the datastream name contains the API name => potential duplicate.
                    if (MatchCountries(candidate.country).Equals(MatchCountries(apiCompany.Country)) &&
                        NameContains(candidate.company_name, apiCompany.CompanyName))
                    {
                        apiMatch = apiCompany;
                        break;
                    }
                }

                // Exact matches are neither returned nor reported as potential duplicates.
                if (exactMatch)
                {
                    continue;
                }

                if (apiMatch == null)
                {
                    filteredDataStreamCompanies.Add(candidate);
                }
                else
                {
                    this._datastreamPotentialDuplicates.Add(new PotentialDiplicate(candidate, null, apiMatch));
                }
            }

            return filteredDataStreamCompanies;
        }