/// <summary>
/// Reads the street list from the hard-coded allStreets.csv file, strips a trailing
/// street suffix from each entry when the last space-separated token is a known suffix,
/// builds a BK-tree over the resulting names and serializes it.
/// </summary>
/// <param name="outputSerializedPath">Destination path passed to <c>BKTreeSerializer.SerializeTo</c>.</param>
static void CreateBKTree(string outputSerializedPath)
{
    string[] allLines = File.ReadAllLines("c:/users/brush/desktop/allStreets.csv");

    // Only data.Suffixes is used below.
    // NOTE(review): regenerateBKTree: true presumably avoids reading a previously serialized
    // tree that may not exist yet - confirm before switching to a lighter loader.
    Data data = DataLoader.LoadData(regenerateBKTree: true);

    List<string> justNames = new List<string>(allLines.Length);
    foreach (string line in allLines)
    {
        // Work on the trimmed text in both branches so names with and without a
        // recognised suffix are normalised the same way.
        string trimmed = line.Trim();
        if (trimmed.Length == 0)
        {
            // A blank line would only add an empty string to the tree.
            continue;
        }

        string[] parts = trimmed.Split(' ');

        // Strip the suffix only when something is left over; a one-word street whose
        // name happens to be a suffix keeps its name instead of becoming "".
        bool endsWithKnownSuffix =
            parts.Length > 1 &&
            data.Suffixes.IndexOf(parts[parts.Length - 1]) != -1;

        if (endsWithKnownSuffix)
        {
            justNames.Add(string.Join(" ", parts.Take(parts.Length - 1)));
        }
        else
        {
            justNames.Add(trimmed);
        }
    }

    BKTree tree = BKTreeEngine.CreateBKTree(justNames);
    BKTreeSerializer.SerializeTo(tree, outputSerializedPath);
}
/// <summary>
/// Builds the street-name data set from a comma-separated street segment file and writes
/// the derived artifacts (serialized street names, known city/street lists, two BK-trees
/// and the street+city -> zips / street+zip -> cities indexes) to hard-coded output paths.
/// </summary>
/// <param name="filePath">Path of the CSV file to read. The first line is skipped.</param>
/// <returns>
/// One <see cref="StreetName"/> per distinct key collected from the file, each carrying the
/// distinct zip codes and cities seen for it.
/// </returns>
/// <remarks>
/// Lines are split on ',' with no quoting support. Blank lines are ignored. A non-blank line
/// with fewer columns than the highest index used still fails, surfaced by
/// <c>Parallel.ForEach</c> as an <see cref="AggregateException"/>.
/// </remarks>
public static List<StreetName> Generate(string filePath)
{
    Data data = DataLoader.LoadJustSuffixes();

    Dictionary<StreetName, List<int>> zipCodes = new Dictionary<StreetName, List<int>>();
    Dictionary<StreetName, List<string>> cities = new Dictionary<StreetName, List<string>>();
    Dictionary<StreetNameAndCity, List<int>> streetNameCity2Zips = new Dictionary<StreetNameAndCity, List<int>>();
    Dictionary<StreetNameAndZip, List<string>> streetNameZip2Cities = new Dictionary<StreetNameAndZip, List<string>>();

    // Zero-based column positions in the input file.
    const int PreDirectionColumn = 8;
    const int PreTypeColumn = 9;
    const int StreetNameColumn = 11;
    const int StreetSuffixColumn = 12;
    const int ZipLeftColumn = 33;
    const int ZipRightColumn = 34;
    const int CityLeftColumn = 35;
    const int CityRightColumn = 36;
    const int CityLeftAlternate = 37;
    const int CityRightAlternate = 38;

    // Drops the ordinal ending from numbered streets ("42ND" -> "42").
    const string OrdinalSuffixPattern = @"(\d+)(TH|ST|ND|RD)";

    // Single gate for all four dictionaries; they are only mutated inside the parallel loop.
    object gate = new object();

    string[] allLines = File.ReadAllLines(filePath).Skip(1).ToArray();

    Parallel.ForEach(allLines, line =>
    {
        // A blank line has no columns to index into; skip it rather than throw.
        if (string.IsNullOrWhiteSpace(line))
        {
            return;
        }

        string[] lineBits = line.Split(',').Select(n => n.Trim()).ToArray();

        // Invariant upper-casing keeps the generated keys independent of the machine culture.
        string preType = lineBits[PreTypeColumn].ToUpperInvariant();
        string streetName = lineBits[StreetNameColumn].ToUpperInvariant();

        if (streetName == "DRIVEWAY" || streetName.Contains("UNNAMED"))
        {
            return;
        }

        // Map a long suffix to its short form. The loop deliberately keeps scanning after
        // a hit, so a replaced value that matches a later long suffix is mapped again.
        string streetSuffix = lineBits[StreetSuffixColumn].ToUpperInvariant();
        for (int c = 0; c < data.Suffixes.LongSuffixes.Length; c++)
        {
            if (data.Suffixes.LongSuffixes[c] == streetSuffix)
            {
                streetSuffix = data.Suffixes.ShortSuffixes[c];
            }
        }

        // Unparseable zips stay 0, which is treated as "no zip" below.
        int zipLeft = 0, zipRight = 0;
        int.TryParse(lineBits[ZipLeftColumn], out zipLeft);
        int.TryParse(lineBits[ZipRightColumn], out zipRight);

        // Fall back to the alternate city column when the primary one is empty.
        string cityLeft = lineBits[CityLeftColumn].ToUpperInvariant();
        if (string.IsNullOrEmpty(cityLeft))
        {
            cityLeft = lineBits[CityLeftAlternate].ToUpperInvariant();
        }

        string cityRight = lineBits[CityRightColumn].ToUpperInvariant();
        if (string.IsNullOrEmpty(cityRight))
        {
            cityRight = lineBits[CityRightAlternate].ToUpperInvariant();
        }

        string preDirection = ExpandDirectionAbbreviation(lineBits[PreDirectionColumn].ToUpperInvariant());
        string cleanedName = Regex.Replace(streetName, OrdinalSuffixPattern, "$1");

        StreetName name = new StreetName(preDirection, preType, cleanedName, streetSuffix, null, null);

        // Computed before taking the lock so the regex work is not serialized across threads.
        string fullStreetName = Regex.Replace(name.FullStreetName, OrdinalSuffixPattern, "$1");

        lock (gate)
        {
            if (zipLeft != 0 && !string.IsNullOrEmpty(cityLeft))
            {
                AddDistinctToIndex(
                    streetNameCity2Zips,
                    new StreetNameAndCity { City = cityLeft, FullStreetName = fullStreetName, },
                    zipLeft);
                AddDistinctToIndex(
                    streetNameZip2Cities,
                    new StreetNameAndZip { FullStreetName = fullStreetName, Zip = zipLeft, },
                    cityLeft);
            }

            if (zipRight != 0 && !string.IsNullOrEmpty(cityRight))
            {
                AddDistinctToIndex(
                    streetNameCity2Zips,
                    new StreetNameAndCity { City = cityRight, FullStreetName = fullStreetName, },
                    zipRight);
                AddDistinctToIndex(
                    streetNameZip2Cities,
                    new StreetNameAndZip { FullStreetName = fullStreetName, Zip = zipRight, },
                    cityRight);
            }

            // Every street gets an entry in both maps, even when it has no zip or city,
            // because the result list is built from the keys of zipCodes.
            List<int> zipsForName = GetOrCreateIndexList(zipCodes, name);
            List<string> citiesForName = GetOrCreateIndexList(cities, name);

            if (zipLeft != 0 && !zipsForName.Contains(zipLeft))
            {
                zipsForName.Add(zipLeft);
            }

            if (zipRight != 0 && !zipsForName.Contains(zipRight))
            {
                zipsForName.Add(zipRight);
            }

            if (!string.IsNullOrEmpty(cityLeft) && !citiesForName.Contains(cityLeft))
            {
                citiesForName.Add(cityLeft);
            }

            if (!string.IsNullOrEmpty(cityRight) && !citiesForName.Contains(cityRight))
            {
                citiesForName.Add(cityRight);
            }
        }
    });

    // The per-street lists are already duplicate-free, so they are handed over as-is.
    List<StreetName> allStreetNames = new List<StreetName>(zipCodes.Count);
    foreach (KeyValuePair<StreetName, List<int>> entry in zipCodes)
    {
        StreetName key = entry.Key;
        allStreetNames.Add(new StreetName(key.PreDirection, key.PreType, key.Name, key.Suffix, entry.Value, cities[key]));
    }

    // NOTE(review): output locations are hard-coded to one user's desktop.
    WriteBinaryFile("c:/users/brush/desktop/streetNames.dat", allStreetNames);

    string[] uniqueCities = allStreetNames.SelectMany(n => n.Cities).Distinct().ToArray();
    File.WriteAllLines("C:/users/brush/desktop/knownCities.csv", uniqueCities);

    string[] uniqueStreets = allStreetNames.Select(n => n.Name).Distinct().ToArray();
    File.WriteAllLines("C:/users/brush/desktop/knownStreets.csv", uniqueStreets);

    BKTree citiesTree = BKTreeEngine.CreateBKTree(uniqueCities.ToList());
    BKTreeSerializer.SerializeTo(citiesTree, "c:/users/brush/desktop/citiesBKTree.dat");

    BKTree streetsTree = BKTreeEngine.CreateBKTree(uniqueStreets.ToList());
    BKTreeSerializer.SerializeTo(streetsTree, "c:/users/brush/desktop/streetsBKTree.dat");

    WriteBinaryFile("C:/users/brush/desktop/streetNameCity2Zips.dat", streetNameCity2Zips);
    WriteBinaryFile("C:/users/brush/desktop/streetNameZip2Cities.dat", streetNameZip2Cities);

    return allStreetNames;
}

/// <summary>Expands a one-letter compass abbreviation (E/W/N/S); any other value is returned unchanged.</summary>
private static string ExpandDirectionAbbreviation(string direction)
{
    switch (direction)
    {
        case "E":
            return "EAST";
        case "W":
            return "WEST";
        case "N":
            return "NORTH";
        case "S":
            return "SOUTH";
        default:
            return direction;
    }
}

/// <summary>Returns the list stored under <paramref name="key"/>, creating and registering an empty one if absent.</summary>
private static List<TValue> GetOrCreateIndexList<TKey, TValue>(Dictionary<TKey, List<TValue>> index, TKey key)
{
    List<TValue> values;
    if (!index.TryGetValue(key, out values))
    {
        values = new List<TValue>();
        index.Add(key, values);
    }

    return values;
}

/// <summary>Appends <paramref name="value"/> to the list under <paramref name="key"/> unless it is already present.</summary>
private static void AddDistinctToIndex<TKey, TValue>(Dictionary<TKey, List<TValue>> index, TKey key, TValue value)
{
    List<TValue> values = GetOrCreateIndexList(index, key);
    if (!values.Contains(value))
    {
        values.Add(value);
    }
}

/// <summary>Serializes <paramref name="graph"/> to <paramref name="path"/>, overwriting any existing file.</summary>
private static void WriteBinaryFile(string path, object graph)
{
    // NOTE(review): BinaryFormatter is obsolete and unsafe on untrusted input. It is kept
    // here so the output format does not change; migrating needs a matching change in
    // whatever reads these files.
    BinaryFormatter formatter = new BinaryFormatter();
    using (FileStream stream = File.Create(path))
    {
        formatter.Serialize(stream, graph);
    }
}
/// <summary>
/// Populates a <see cref="Data"/> instance from the reference files: raw data lines,
/// street suffixes, unknown/homeless address lists, abbreviation tables, the suffix
/// replacement key, known centers, the alternate suffix table, serialized street names,
/// the city/state street lists, BK-trees and the known-cities list.
/// </summary>
/// <param name="regenerateBKTree">
/// When true the street-name BK-tree is rebuilt from <c>NYStateStreets</c>; otherwise it is
/// deserialized from "bkTree.dat".
/// </param>
/// <returns>The populated <see cref="Data"/> instance.</returns>
public static Data LoadData(bool regenerateBKTree)
{
    Data result = new Data();

    // Raw data: skip the first line and drop rows that consist only of separators.
    result.FinalDataSet = FileLibrary.GetLines()
        .Skip(1)
        .Where(row => row != ",,,,,,,,,,,,,,,,,,")
        .ToArray();

    // Suffixes: each line is "<long>,<short>".
    result.Suffixes = new AddressSuffixes();
    string[][] suffixCells = File.ReadAllLines(StreetSuffixesPath)
        .Select(row => row.Split(','))
        .ToArray();
    result.Suffixes.ShortSuffixes = suffixCells.Select(cells => cells[1]).ToArray();
    result.Suffixes.LongSuffixes = suffixCells.Select(cells => cells[0]).ToArray();

    // Unknown and homeless address lists are kept as raw lines.
    result.UnknownAddresses = File.ReadAllLines("UnknownAddresses.csv");
    result.HomelessAddresses = File.ReadAllLines("HomelessAddresses.csv");

    // Abbreviation tables: a repeated key is an error in these two files.
    result.Abbreviations = ReadCommaSeparatedPairs("Abbreviations.txt", false);
    result.AbbreviationsShortened = ReadCommaSeparatedPairs("AbbreviationsShortened.txt", false);

    // Suffix replacement key: the first occurrence of a key wins.
    result.SuffixReplacementKey = ReadCommaSeparatedPairs("SuffixReplacementKey.txt", true);

    // Known centers: "<key>;<name>,<number>,<street>,<city>,<state>,<zip>".
    result.KnownCenters = new Dictionary<string, Address>();
    foreach (string centerLine in File.ReadAllLines("KnownCenters.txt"))
    {
        string[] keyAndAddress = centerLine.Split(';').Select(part => part.Trim()).ToArray();
        string[] fields = keyAndAddress[1].Split(',').Select(part => part.Trim()).ToArray();

        Address center = new Address
        {
            CenterName = fields[0],
            StreetNumber = fields[1],
            StreetName = fields[2],
            City = fields[3],
            State = fields[4],
        };

        // The zip is optional; an empty field leaves Address.Zip untouched.
        if (fields[5].Length > 0)
        {
            center.Zip = int.Parse(fields[5]);
        }

        if (center.StreetNumber == "")
        {
            center.FullStreetName = center.StreetName;
        }
        else
        {
            center.FullStreetName = $"{center.StreetNumber} {center.StreetName}";
        }

        result.KnownCenters.Add(keyAndAddress[0], center);
    }

    // Alternate suffix list: "<street>:<suffix>,<suffix>,...".
    result.AlternateSuffixList = File.ReadAllLines("streetToSuffixTable.txt")
        .Select(row => row.Split(':'))
        .ToDictionary(halves => halves[0], halves => halves[1].Split(',').ToList());

    // Serialized street names.
    BinaryFormatter formatter = new BinaryFormatter();
    using (FileStream input = File.OpenRead("streetNames.dat"))
    {
        result.StreetData = (List<StreetName>)formatter.Deserialize(input);
    }

    result.NYCityStreets = LoadNYCityAddresses(result);
    result.NYStateStreets = LoadNYStateStreets(result);

    // Street-name BK-tree: load the serialized tree unless a rebuild was requested.
    if (!regenerateBKTree)
    {
        result.StreetNameBKTree = BKTreeSerializer.DeserializeFrom("bkTree.dat");
    }
    else
    {
        result.StreetNameBKTree = BKTreeEngine.CreateBKTree(result.NYStateStreets.ToList());
    }

    result.CityNameBKTree = BKTreeSerializer.DeserializeFrom("citiesBKTree.dat");
    result.KnownCities = new List<string>(File.ReadAllLines("knownCities.csv"));

    return result;
}

/// <summary>
/// Reads a file of "key,value" lines into a dictionary, trimming both cells.
/// </summary>
/// <param name="path">File to read.</param>
/// <param name="keepFirstOnDuplicate">
/// When true a repeated key is skipped and the first value is kept; when false a repeated
/// key makes <c>Dictionary.Add</c> throw.
/// </param>
private static Dictionary<string, string> ReadCommaSeparatedPairs(string path, bool keepFirstOnDuplicate)
{
    Dictionary<string, string> pairs = new Dictionary<string, string>();
    foreach (string row in File.ReadAllLines(path))
    {
        string[] cells = row.Split(',').Select(cell => cell.Trim()).ToArray();
        if (keepFirstOnDuplicate && pairs.ContainsKey(cells[0]))
        {
            continue;
        }

        pairs.Add(cells[0], cells[1]);
    }

    return pairs;
}