/// <summary>
/// Builds a BK-tree from the street names listed in allStreets.csv and
/// serializes it to <paramref name="outputSerializedPath"/>. When the last
/// space-separated word of a line is found by <c>data.Suffixes.IndexOf</c>,
/// that word is dropped before the name is added to the tree.
/// </summary>
/// <param name="outputSerializedPath">Path handed to <c>BKTreeSerializer.SerializeTo</c>.</param>
static void CreateBKTree(string outputSerializedPath)
{
    string[] allLines = File.ReadAllLines("c:/users/brush/desktop/allStreets.csv");
    Data data = DataLoader.LoadData(regenerateBKTree: true);

    List<string> justNames = new List<string>(allLines.Length);
    foreach (string line in allLines)
    {
        string trimmedLine = line.Trim();
        string[] parts = trimmedLine.Split(' ');

        if (data.Suffixes.IndexOf(parts[parts.Length - 1]) != -1)
        {
            // Last word is a known suffix: keep everything in front of it.
            justNames.Add(string.Join(" ", parts.Take(parts.Length - 1)));
        }
        else
        {
            // Store the trimmed text so both branches normalize surrounding
            // whitespace the same way (previously the raw line was stored here).
            justNames.Add(trimmedLine);
        }
    }

    BKTree tree = BKTreeEngine.CreateBKTree(justNames);
    BKTreeSerializer.SerializeTo(tree, outputSerializedPath);
}
/// <summary>
/// Proposes alternate address lines for <paramref name="address"/> by looking up
/// its street name in the street-name BK-tree and expanding every known street
/// with one of the returned names into all of its city / zip combinations.
/// </summary>
/// <param name="address">Address whose <c>StreetName</c> is matched.</param>
/// <param name="data">Reference data (BK-tree, street list, street+city to zip map).</param>
/// <param name="alternateLines">
/// Shared output list; generated lines are appended while holding a lock on this list.
/// </param>
/// <returns><c>true</c> when at least one alternate line was produced.</returns>
public static bool IsSoftAddressMatch(Address address, Data data, List<string> alternateLines)
{
    if (string.IsNullOrEmpty(address.StreetName))
    {
        return false;
    }

    List<string> innerAlternateLines = new List<string>();
    List<string> softMatchedStreets = BKTreeEngine.LeastEditDistance(address.StreetName, data.StreetNameBKTree).Distinct().ToList();

    // Only scan the street list once the first candidate passes the normalized
    // edit-distance threshold; the scan is wasted work otherwise.
    if (softMatchedStreets.Count > 0 &&
        EditDistanceEngine.ComputeNormalized(softMatchedStreets[0], address.StreetName) < .5f)
    {
        HashSet<string> candidateNames = new HashSet<string>(softMatchedStreets);

        foreach (StreetName name in data.StreetData)
        {
            if (string.IsNullOrEmpty(name.Name) || !candidateNames.Contains(name.Name))
            {
                continue;
            }

            foreach (string city in name.Cities)
            {
                StreetNameAndCity cityKey = new StreetNameAndCity
                {
                    City = city,
                    FullStreetName = name.FullStreetName,
                };

                // A city can be recorded for a street without any zip (Generate
                // records the city even when the zip column parsed to 0), so the
                // pair may have no entry; skip it instead of throwing.
                if (!data.StreetNameCity2Zips.ContainsKey(cityKey))
                {
                    continue;
                }

                foreach (int possibleZip in data.StreetNameCity2Zips[cityKey])
                {
                    innerAlternateLines.Add(AddressUtility.CreateLineFromAddress(address, name.FullStreetName, possibleZip, city));
                }
            }
        }
    }

    lock (alternateLines)
    {
        alternateLines.AddRange(innerAlternateLines);
    }

    return innerAlternateLines.Count > 0;
}
/// <summary>
/// Looks <paramref name="stringToTest"/> up in the street-name BK-tree and, when the
/// reported distance does not exceed <paramref name="maxEditDistance"/>, corrects
/// <paramref name="address"/> using the first neighbor returned.
/// </summary>
/// <param name="data">Reference data holding <c>StreetNameBKTree</c>.</param>
/// <param name="address">Address passed by reference to <c>CorrectAddress</c> on a hit.</param>
/// <param name="stringToTest">Street string to look up.</param>
/// <param name="maxEditDistance">Largest distance still accepted as a match.</param>
/// <returns>The result of <c>CorrectAddress</c> on a hit; otherwise <c>null</c>.</returns>
private static string TestAddress(Data data, ref Address address, string stringToTest, int maxEditDistance)
{
    int bestDistance;
    List<string> nearest = BKTreeEngine.LeastEditDistanceWithDistance(stringToTest, data.StreetNameBKTree, out bestDistance);

    if (bestDistance > maxEditDistance)
    {
        return null;
    }

    return CorrectAddress(data, ref address, nearest[0]);
}
/// <summary>
/// Proposes alternate address lines where the zip must match exactly while the
/// street name and city are taken from the respective BK-tree lookups.
/// </summary>
/// <param name="address">Address providing <c>StreetName</c>, <c>City</c> and <c>Zip</c>.</param>
/// <param name="data">Reference data (BK-trees, street list, street+zip to city map).</param>
/// <param name="alternateLines">
/// Shared output list; generated lines are appended while holding a lock on this list.
/// </param>
/// <returns>
/// <c>true</c> when at least one alternate line was produced; <c>false</c> when none
/// was, or when the street name, city or zip is missing.
/// </returns>
public static bool IsSoftAddressAndSoftCityHardZipMatch(Address address, Data data, List<string> alternateLines)
{
    if (string.IsNullOrEmpty(address.StreetName) || address.Zip == null || string.IsNullOrEmpty(address.City))
    {
        return false;
    }

    int zip = address.Zip.Value;
    List<string> innerAlternateLines = new List<string>();
    List<string> softMatchedCities = BKTreeEngine.LeastEditDistance(address.City, data.CityNameBKTree).Distinct().ToList();
    HashSet<string> softMatchedStreets = new HashSet<string>(BKTreeEngine.LeastEditDistance(address.StreetName, data.StreetNameBKTree));

    foreach (StreetName name in data.StreetData)
    {
        // Cheapest filters first: the name must be a candidate and carry the zip.
        if (string.IsNullOrEmpty(name.Name) ||
            !softMatchedStreets.Contains(name.Name) ||
            !name.ZipCodes.Contains(zip))
        {
            continue;
        }

        StreetNameAndZip zipKey = new StreetNameAndZip
        {
            Zip = zip,
            FullStreetName = name.FullStreetName,
        };

        // The street can list the zip without a city having been recorded for
        // that exact street+zip pair; skip it instead of throwing on the lookup.
        if (!data.StreetNameZip2Cities.ContainsKey(zipKey))
        {
            continue;
        }

        // The lookup does not depend on the candidate city, so do it once per street.
        var citiesForZip = data.StreetNameZip2Cities[zipKey];

        foreach (string softMatchedCity in softMatchedCities)
        {
            if (citiesForZip.Contains(softMatchedCity) && name.Cities.Contains(softMatchedCity))
            {
                innerAlternateLines.Add(AddressUtility.CreateLineFromAddress(address, name.FullStreetName, address.Zip, softMatchedCity));
            }
        }
    }

    lock (alternateLines)
    {
        alternateLines.AddRange(innerAlternateLines);
    }

    return innerAlternateLines.Count > 0;
}
/// <summary>
/// Proposes alternate address lines where both the street name and the city are
/// taken from the respective BK-tree lookups; every zip recorded for a matching
/// street + city pair yields one line.
/// </summary>
/// <param name="address">Address providing <c>StreetName</c> and <c>City</c>.</param>
/// <param name="data">Reference data (BK-trees, street list, street+city to zip map).</param>
/// <param name="alternateLines">
/// Shared output list; generated lines are appended while holding a lock on this list.
/// </param>
/// <returns><c>true</c> when at least one alternate line was produced.</returns>
public static bool IsSoftAddressAndSoftCityMatch(Address address, Data data, List<string> alternateLines)
{
    if (string.IsNullOrEmpty(address.StreetName) || string.IsNullOrEmpty(address.City))
    {
        return false;
    }

    List<string> innerAlternateLines = new List<string>();
    List<string> softMatchedCities = BKTreeEngine.LeastEditDistance(address.City, data.CityNameBKTree).Distinct().ToList();
    HashSet<string> softMatchedStreets = new HashSet<string>(BKTreeEngine.LeastEditDistance(address.StreetName, data.StreetNameBKTree));

    foreach (StreetName name in data.StreetData)
    {
        // Filter on the street name first so non-candidates cost one hash lookup.
        if (string.IsNullOrEmpty(name.Name) || !softMatchedStreets.Contains(name.Name))
        {
            continue;
        }

        foreach (string softMatchedCity in softMatchedCities)
        {
            if (!name.Cities.Contains(softMatchedCity))
            {
                continue;
            }

            StreetNameAndCity cityKey = new StreetNameAndCity
            {
                City = softMatchedCity,
                FullStreetName = name.FullStreetName,
            };

            // A city can be recorded for a street without any zip (Generate
            // records the city even when the zip column parsed to 0), so the
            // pair may have no entry; skip it instead of throwing.
            if (!data.StreetNameCity2Zips.ContainsKey(cityKey))
            {
                continue;
            }

            foreach (int zip in data.StreetNameCity2Zips[cityKey])
            {
                innerAlternateLines.Add(AddressUtility.CreateLineFromAddress(address, name.FullStreetName, zip, softMatchedCity));
            }
        }
    }

    lock (alternateLines)
    {
        alternateLines.AddRange(innerAlternateLines);
    }

    return innerAlternateLines.Count > 0;
}
/// <summary>
/// Looks for a known street whose stem occurs somewhere inside the address's full
/// street text (i.e. the parts may be in a different order) in one of the cities
/// returned by the city BK-tree lookup. Each candidate is scored by how many of
/// its pre-direction, pre-type and suffix are also present in the address; the
/// first candidate with the highest non-zero score wins and one alternate line is
/// added per zip recorded for that street + city.
/// </summary>
/// <param name="address">Address being matched.</param>
/// <param name="data">Reference data (city BK-tree, street list, street+city to zip map).</param>
/// <param name="alternateLines">
/// Shared output list; generated lines are appended while holding a lock on this list.
/// </param>
/// <returns><c>true</c> when a best street was found and its zips were looked up.</returns>
public static bool IsRearrangedAddressAndSoftCityMatch(Address address, Data data, List<string> alternateLines)
{
    if (string.IsNullOrEmpty(address.StreetName) || string.IsNullOrEmpty(address.City))
    {
        return false;
    }

    List<string> softMatchedCities = BKTreeEngine.LeastEditDistance(address.City, data.CityNameBKTree).Distinct().ToList();

    // Does not depend on the candidate street, so build it once.
    string fullName = string.Join(" ", address.StreetNumber, address.StreetName, address.Suffix);

    StreetName bestMatch = null;
    int highestMatchNumber = 0;
    string bestSoftMatchedCity = null;

    foreach (StreetName name in data.StreetData)
    {
        // demand the stem is a match
        if (string.IsNullOrEmpty(name.Name) || !StringUtility.Contains(address.FullStreetName, name.Name))
        {
            continue;
        }

        int matchNumber = 0;

        // Generate() stores pre-directions expanded ("EAST"), while this check
        // previously only recognized the one-letter form; accept both spellings.
        string expectedCardinal = null;
        switch (name.PreDirection)
        {
            case "E":
            case "EAST":
                expectedCardinal = "EAST";
                break;
            case "W":
            case "WEST":
                expectedCardinal = "WEST";
                break;
            case "S":
            case "SOUTH":
                expectedCardinal = "SOUTH";
                break;
            case "N":
            case "NORTH":
                expectedCardinal = "NORTH";
                break;
        }

        if (expectedCardinal != null && address.CardinalDirection == expectedCardinal)
        {
            matchNumber++;
        }

        // demand the pretype / suffix, somewhere (if they exist)
        if (!string.IsNullOrEmpty(name.PreType) && StringUtility.Contains(fullName, name.PreType))
        {
            matchNumber++;
        }

        if (!string.IsNullOrEmpty(name.Suffix) && StringUtility.Contains(fullName, name.Suffix))
        {
            matchNumber++;
        }

        // The score is city-independent, so a street that cannot beat the current
        // best needs no city check at all (ties keep the earlier candidate).
        if (matchNumber <= highestMatchNumber)
        {
            continue;
        }

        // demand the city is a match; the first matching candidate city is used
        foreach (string softMatchedCity in softMatchedCities)
        {
            if (name.Cities.Contains(softMatchedCity))
            {
                highestMatchNumber = matchNumber;
                bestMatch = name;
                bestSoftMatchedCity = softMatchedCity;
                break;
            }
        }
    }

    if (highestMatchNumber <= 0)
    {
        return false;
    }

    // given the city + street, what are the available zips?
    StreetNameAndCity zipKey = new StreetNameAndCity
    {
        City = bestSoftMatchedCity,
        FullStreetName = bestMatch.FullStreetName,
    };

    // No zip recorded for this street + city pair: nothing to propose.
    if (!data.StreetNameCity2Zips.ContainsKey(zipKey))
    {
        return false;
    }

    int[] possibleZips = data.StreetNameCity2Zips[zipKey].ToArray();

    lock (alternateLines)
    {
        foreach (int possibleZip in possibleZips)
        {
            alternateLines.Add(AddressUtility.CreateLineFromAddress(address, bestMatch.FullStreetName, possibleZip, bestSoftMatchedCity));
        }
    }

    return true;
}
/// <summary>
/// Parses the street CSV at <paramref name="filePath"/> (first line skipped as a
/// header), builds the list of distinct streets with their zips and cities, and
/// writes the derived lookup files (street list, known cities / streets, the two
/// BK-trees and the street+city to zip / street+zip to city maps) to the hard-coded
/// desktop directory.
/// </summary>
/// <param name="filePath">Path of the comma-separated street file.</param>
/// <returns>One <c>StreetName</c> per distinct street, with de-duplicated zips and cities.</returns>
public static List<StreetName> Generate(string filePath)
{
    const int PreDirectionColumn = 8;
    const int PreTypeColumn = 9;
    const int StreetNameColumn = 11;
    const int StreetSuffixColumn = 12;
    const int ZipLeftColumn = 33;
    const int ZipRightColumn = 34;
    const int CityLeftColumn = 35;
    const int CityRightColumn = 36;
    const int CityLeftAlternate = 37;
    const int CityRightAlternate = 38;

    // Strips ordinal endings that directly follow digits, e.g. "5TH" -> "5".
    const string OrdinalSuffixPattern = @"(\d+)(TH|ST|ND|RD)";

    Data data = DataLoader.LoadJustSuffixes();

    Dictionary<StreetName, List<int>> zipCodes = new Dictionary<StreetName, List<int>>();
    Dictionary<StreetName, List<string>> cities = new Dictionary<StreetName, List<string>>();
    Dictionary<StreetNameAndCity, List<int>> streetNameCity2Zips = new Dictionary<StreetNameAndCity, List<int>>();
    Dictionary<StreetNameAndZip, List<string>> streetNameZip2Cities = new Dictionary<StreetNameAndZip, List<string>>();

    string[] allLines = File.ReadAllLines(filePath).Skip(1).ToArray();

    Parallel.ForEach(allLines, line =>
    {
        string[] lineBits = line.Split(',').Select(n => n.Trim()).ToArray();
        string preType = lineBits[PreTypeColumn].ToUpper();
        string streetName = lineBits[StreetNameColumn].ToUpper();

        if (streetName == "DRIVEWAY" || streetName.Contains("UNNAMED"))
        {
            return;
        }

        // Replace a long suffix with the short suffix at the same index.
        string streetSuffix = lineBits[StreetSuffixColumn].ToUpper();
        for (int c = 0; c < data.Suffixes.LongSuffixes.Length; c++)
        {
            if (data.Suffixes.LongSuffixes[c] == streetSuffix)
            {
                streetSuffix = data.Suffixes.ShortSuffixes[c];
            }
        }

        // TryParse leaves 0 behind on failure; 0 is treated as "no zip" below.
        int zipLeft, zipRight;
        int.TryParse(lineBits[ZipLeftColumn], out zipLeft);
        int.TryParse(lineBits[ZipRightColumn], out zipRight);

        string cityLeft = lineBits[CityLeftColumn].ToUpper();
        string cityRight = lineBits[CityRightColumn].ToUpper();
        if (string.IsNullOrEmpty(cityLeft))
        {
            cityLeft = lineBits[CityLeftAlternate].ToUpper();
        }

        if (string.IsNullOrEmpty(cityRight))
        {
            cityRight = lineBits[CityRightAlternate].ToUpper();
        }

        string preDirection = lineBits[PreDirectionColumn].ToUpper();
        switch (preDirection)
        {
            case "E":
                preDirection = "EAST";
                break;
            case "W":
                preDirection = "WEST";
                break;
            case "N":
                preDirection = "NORTH";
                break;
            case "S":
                preDirection = "SOUTH";
                break;
        }

        string cleanedName = Regex.Replace(streetName, OrdinalSuffixPattern, "$1");
        StreetName name = new StreetName(preDirection, preType, cleanedName, streetSuffix, null, null);
        string fullStreetName = Regex.Replace(name.FullStreetName, OrdinalSuffixPattern, "$1");

        // One lock guards both pair maps, as before.
        lock (streetNameCity2Zips)
        {
            RecordZipCityPair(streetNameCity2Zips, streetNameZip2Cities, fullStreetName, zipLeft, cityLeft);
            RecordZipCityPair(streetNameCity2Zips, streetNameZip2Cities, fullStreetName, zipRight, cityRight);
        }

        lock (zipCodes)
        {
            List<int> zipsForStreet = GetOrCreateList(zipCodes, name);
            if (zipLeft != 0)
            {
                zipsForStreet.Add(zipLeft);
            }

            if (zipRight != 0 && zipLeft != zipRight)
            {
                zipsForStreet.Add(zipRight);
            }
        }

        lock (cities)
        {
            List<string> citiesForStreet = GetOrCreateList(cities, name);
            if (!string.IsNullOrEmpty(cityLeft))
            {
                citiesForStreet.Add(cityLeft);
            }

            if (!string.IsNullOrEmpty(cityRight) && cityRight != cityLeft)
            {
                citiesForStreet.Add(cityRight);
            }
        }
    });

    // The per-street lists may hold repeats collected from different lines;
    // de-duplicate once here when the final objects are built.
    List<StreetName> allStreetNames = new List<StreetName>();
    foreach (StreetName key in zipCodes.Keys.ToArray())
    {
        allStreetNames.Add(new StreetName(
            key.PreDirection,
            key.PreType,
            key.Name,
            key.Suffix,
            zipCodes[key].Distinct().ToList(),
            cities[key].Distinct().ToList()));
    }

    // NOTE(review): BinaryFormatter is obsolete and unsafe on untrusted input. It is
    // kept because LoadData reads streetNames.dat with the same formatter; migrating
    // means changing both sides together.
    BinaryFormatter bf = new BinaryFormatter();
    using (FileStream sw = File.Create("c:/users/brush/desktop/streetNames.dat"))
    {
        bf.Serialize(sw, allStreetNames);
    }

    string[] uniqueCities = allStreetNames.SelectMany(n => n.Cities).Distinct().ToArray();
    File.WriteAllLines("C:/users/brush/desktop/knownCities.csv", uniqueCities);

    string[] uniqueStreets = allStreetNames.Select(n => n.Name).Distinct().ToArray();
    File.WriteAllLines("C:/users/brush/desktop/knownStreets.csv", uniqueStreets);

    BKTree citiesTree = BKTreeEngine.CreateBKTree(uniqueCities.ToList());
    BKTreeSerializer.SerializeTo(citiesTree, "c:/users/brush/desktop/citiesBKTree.dat");

    BKTree streetsTree = BKTreeEngine.CreateBKTree(uniqueStreets.ToList());
    BKTreeSerializer.SerializeTo(streetsTree, "c:/users/brush/desktop/streetsBKTree.dat");

    using (FileStream fw = File.Create("C:/users/brush/desktop/streetNameCity2Zips.dat"))
    {
        bf.Serialize(fw, streetNameCity2Zips);
    }

    using (FileStream fw = File.Create("C:/users/brush/desktop/streetNameZip2Cities.dat"))
    {
        bf.Serialize(fw, streetNameZip2Cities);
    }

    return allStreetNames;
}

/// <summary>
/// Returns the list stored under <paramref name="key"/>, adding an empty one first
/// when the key is not present yet.
/// </summary>
private static List<TValue> GetOrCreateList<TKey, TValue>(Dictionary<TKey, List<TValue>> map, TKey key)
{
    List<TValue> list;
    if (!map.TryGetValue(key, out list))
    {
        list = new List<TValue>();
        map.Add(key, list);
    }

    return list;
}

/// <summary>
/// Records one zip/city observation for a street in both pair maps. Does nothing
/// when the zip is 0 or the city is empty. Values are only appended when not already
/// present, which keeps each list distinct without rebuilding it on every insert.
/// The caller is responsible for holding the lock that guards both maps.
/// </summary>
private static void RecordZipCityPair(
    Dictionary<StreetNameAndCity, List<int>> streetNameCity2Zips,
    Dictionary<StreetNameAndZip, List<string>> streetNameZip2Cities,
    string fullStreetName,
    int zip,
    string city)
{
    if (zip == 0 || string.IsNullOrEmpty(city))
    {
        return;
    }

    StreetNameAndCity cityKey = new StreetNameAndCity
    {
        City = city,
        FullStreetName = fullStreetName,
    };
    List<int> zipsForCity = GetOrCreateList(streetNameCity2Zips, cityKey);
    if (!zipsForCity.Contains(zip))
    {
        zipsForCity.Add(zip);
    }

    StreetNameAndZip zipKey = new StreetNameAndZip
    {
        FullStreetName = fullStreetName,
        Zip = zip,
    };
    List<string> citiesForZip = GetOrCreateList(streetNameZip2Cities, zipKey);
    if (!citiesForZip.Contains(city))
    {
        citiesForZip.Add(city);
    }
}
/// <summary>
/// Initializes an address from <paramref name="line"/> and, if its match quality is
/// still undetermined, walks through a fixed ladder of lookups until one succeeds:
/// exact street lookup, exact lookup with " ST" / " AVE" / " BLVD" appended, a
/// zip-restricted lookup, then the same street lookups again allowing distance 2.
/// The level that produced the match is recorded in the corresponding levelNMatch
/// collection, and the address is finally filed into the bucket for its match quality.
/// </summary>
/// <param name="line">Raw input line handed to <c>AddressUtility.InitializeAddress</c>.</param>
/// <param name="data">Reference data used by the lookups.</param>
/// <returns>The (possibly corrected) address.</returns>
private static Address LucasAddressMatch(string line, Data data)
{
    string[] suffixProbes = { " ST", " AVE", " BLVD" };

    Address address = AddressUtility.InitializeAddress(line, data);
    bool matchFound = address.MatchQuality != MatchQuality.MatchNotYetDetermined;

    if (matchFound)
    {
        level1Match.Add(address.FullStreetName);
    }
    else if (TestAddress(data, ref address, address.FullStreetName, 0) != null)
    {
        matchFound = true;
        level2Match.Add(address.FullStreetName);
    }

    // Exact lookups with a common suffix appended.
    for (int i = 0; !matchFound && i < suffixProbes.Length; i++)
    {
        if (TestAddress(data, ref address, address.FullStreetName + suffixProbes[i], 0) != null)
        {
            matchFound = true;
            level3Match.Add(address.FullStreetName);
        }
    }

    // Search by ZIP
    if (!matchFound &&
        address.Zip != null &&
        !string.IsNullOrEmpty(address.StreetName) &&
        !string.IsNullOrEmpty(address.StreetNumber))
    {
        List<string> streetsInZip = data.NYCityStreets
            .Where(n => n.StreetNumber.IsInRange(address.StreetNumber) && n.ZipCode == address.Zip.Value)
            .Select(s => s.FullStreetName)
            .Distinct()
            .ToList();

        BKTree zipLocalTree = BKTreeEngine.CreateBKTree(streetsInZip);

        int zipDistance;
        List<string> zipNeighbors = BKTreeEngine.LeastEditDistanceWithDistance(address.FullStreetName, zipLocalTree, out zipDistance);

        // Only an unambiguous (single) neighbor within distance 2 is accepted.
        if (zipNeighbors.Count == 1 && zipDistance <= 2)
        {
            CorrectAddress(data, ref address, zipNeighbors[0]);
            matchFound = true;

            if (zipDistance <= 1)
            {
                level5Match.Add(address.FullStreetName);
            }
            else
            {
                level6Match.Add(address.FullStreetName);
            }
        }
    }

    // Same street lookups as above, now tolerating an edit distance of 2.
    if (!matchFound && TestAddress(data, ref address, address.FullStreetName, 2) != null)
    {
        matchFound = true;
        level7Match.Add(address.FullStreetName);
    }

    for (int i = 0; !matchFound && i < suffixProbes.Length; i++)
    {
        if (TestAddress(data, ref address, address.FullStreetName + suffixProbes[i], 2) != null)
        {
            matchFound = true;
            level8Match.Add(address.FullStreetName);
        }
    }

    // Debug
    if (!matchFound)
    {
        // Disabled diagnostic dump of unmatched addresses.
        if (false)
        {
            string addressRaw = $"{address.RawAddress1} / {address.RawAddress2}";
            string addressCleaned = $"{ address.StreetNumber } / { address.StreetName} / { address.Suffix}";
            if (!string.IsNullOrEmpty(address.ApartmentNumber))
            {
                addressCleaned += $" / {address.ApartmentNumber}";
            }

            string closestNeighborsConcatenated = string.Join(" OR ", BKTreeEngine.LeastEditDistance(address.FullStreetName, data.StreetNameBKTree));
            Console.WriteLine($"{addressRaw} => {addressCleaned} => {closestNeighborsConcatenated}");
        }
    }

    // File the address into the bucket that corresponds to its final quality.
    var quality = address.MatchQuality;
    if (quality == MatchQuality.Unknown)
    {
        lock (unknown)
        {
            unknown.Add(AddressUtility.CreateLineFromAddress(address, "UNKNOWN"));
        }
    }
    else if (quality == MatchQuality.Homeless)
    {
        lock (homeless)
        {
            homeless.Add(AddressUtility.CreateLineFromAddress(address, "HOMELESS"));
        }
    }
    else if (quality == MatchQuality.Alternate)
    {
        lock (alternate)
        {
            alternate.Add(address.RawAddress1);
        }
    }
    else if (quality == MatchQuality.LeaveAlone)
    {
        lock (leaveAlone)
        {
            leaveAlone.Add(address.RawAddress1);
        }
    }
    else if (quality == MatchQuality.MatchNotYetDetermined)
    {
        lock (matchNotYetDetermined)
        {
            matchNotYetDetermined.Add($"{address.RawAddress1}=>{address.FullStreetName}");
        }
    }

    return address;
}
/// <summary>
/// Loads every reference data set the address matcher uses into a new <c>Data</c>
/// instance: the raw data set, street suffixes, unknown / homeless address lists,
/// abbreviation tables, suffix replacements, known centers, alternate suffixes,
/// the serialized street list, the NY city / state street data and the BK-trees.
/// </summary>
/// <param name="regenerateBKTree">
/// When <c>true</c> the street-name BK-tree is rebuilt from <c>NYStateStreets</c>;
/// otherwise it is deserialized from bkTree.dat.
/// </param>
/// <returns>The populated <c>Data</c> instance.</returns>
public static Data LoadData(bool regenerateBKTree)
{
    Data data = new Data();

    // Raw data: skip the first line and drop rows that consist of commas only.
    data.FinalDataSet = FileLibrary.GetLines()
        .Skip(1)
        .Where(l => l != ",,,,,,,,,,,,,,,,,,")
        .ToArray();

    // Suffixes: column 0 is the long form, column 1 the short form.
    data.Suffixes = new AddressSuffixes();
    string[][] suffixColumns = File.ReadAllLines(StreetSuffixesPath)
        .Select(n => n.Split(','))
        .ToArray();
    data.Suffixes.ShortSuffixes = suffixColumns.Select(columns => columns[1]).ToArray();
    data.Suffixes.LongSuffixes = suffixColumns.Select(columns => columns[0]).ToArray();

    // Unknown and Homeless
    data.UnknownAddresses = File.ReadAllLines("UnknownAddresses.csv");
    data.HomelessAddresses = File.ReadAllLines("HomelessAddresses.csv");

    // Abbreviations (a repeated key makes Add throw)
    Dictionary<string, string> abbreviations = new Dictionary<string, string>();
    data.Abbreviations = abbreviations;
    foreach (string entry in File.ReadAllLines("Abbreviations.txt"))
    {
        string[] fields = SplitTrimmed(entry, ',');
        abbreviations.Add(fields[0], fields[1]);
    }

    Dictionary<string, string> shortenedAbbreviations = new Dictionary<string, string>();
    data.AbbreviationsShortened = shortenedAbbreviations;
    foreach (string entry in File.ReadAllLines("AbbreviationsShortened.txt"))
    {
        string[] fields = SplitTrimmed(entry, ',');
        shortenedAbbreviations.Add(fields[0], fields[1]);
    }

    // SuffixReplacementKey (the first occurrence of a key wins)
    string[] replacementEntries = File.ReadAllLines("SuffixReplacementKey.txt");
    Dictionary<string, string> suffixReplacements = new Dictionary<string, string>();
    data.SuffixReplacementKey = suffixReplacements;
    foreach (string entry in replacementEntries)
    {
        string[] fields = SplitTrimmed(entry, ',');
        if (suffixReplacements.ContainsKey(fields[0]))
        {
            continue;
        }

        suffixReplacements.Add(fields[0], fields[1]);
    }

    // KnownCenters: "<key>; <center>, <number>, <street>, <city>, <state>, <zip>"
    string[] centerEntries = File.ReadAllLines("KnownCenters.txt");
    Dictionary<string, Address> knownCenters = new Dictionary<string, Address>();
    data.KnownCenters = knownCenters;
    foreach (string entry in centerEntries)
    {
        string[] halves = SplitTrimmed(entry, ';');
        string[] addressFields = SplitTrimmed(halves[1], ',');

        Address center = new Address();
        center.CenterName = addressFields[0];
        center.StreetNumber = addressFields[1];
        center.StreetName = addressFields[2];
        center.City = addressFields[3];
        center.State = addressFields[4];

        // The zip field may be blank, in which case Zip is left unset.
        if (addressFields[5].Length > 0)
        {
            center.Zip = int.Parse(addressFields[5]);
        }

        if (center.StreetNumber != "")
        {
            center.FullStreetName = $"{center.StreetNumber} {center.StreetName}";
        }
        else
        {
            center.FullStreetName = center.StreetName;
        }

        knownCenters.Add(halves[0], center);
    }

    // AlternateSuffixList: "<street>:<suffix>,<suffix>,..."
    string[] suffixTableLines = File.ReadAllLines("streetToSuffixTable.txt");
    Dictionary<string, List<string>> alternateSuffixes = new Dictionary<string, List<string>>();
    data.AlternateSuffixList = alternateSuffixes;
    foreach (string tableLine in suffixTableLines)
    {
        string[] halves = tableLine.Split(':');
        alternateSuffixes.Add(halves[0], halves[1].Split(',').ToList());
    }

    // Street list.
    // NOTE(review): BinaryFormatter deserialization is unsafe on untrusted files.
    BinaryFormatter formatter = new BinaryFormatter();
    using (FileStream streetStream = File.OpenRead("streetNames.dat"))
    {
        data.StreetData = (List<StreetName>)formatter.Deserialize(streetStream);
    }

    data.NYCityStreets = LoadNYCityAddresses(data);
    data.NYStateStreets = LoadNYStateStreets(data);

    // BKTree: a regenerated tree is used in memory only, it is not written back to bkTree.dat.
    if (!regenerateBKTree)
    {
        data.StreetNameBKTree = BKTreeSerializer.DeserializeFrom("bkTree.dat");
    }
    else
    {
        data.StreetNameBKTree = BKTreeEngine.CreateBKTree(data.NYStateStreets.ToList());
    }

    data.CityNameBKTree = BKTreeSerializer.DeserializeFrom("citiesBKTree.dat");
    data.KnownCities = new List<string>(File.ReadAllLines("knownCities.csv"));

    return data;
}

/// <summary>
/// Splits <paramref name="text"/> on <paramref name="separator"/> and trims every piece.
/// </summary>
private static string[] SplitTrimmed(string text, char separator)
{
    return text.Split(separator).Select(piece => piece.Trim()).ToArray();
}