/// <summary>
/// Builds a lookup from normalized street names to the distinct ZIP codes seen for them in two
/// address CSV exports, writes the list of street names to a CSV file, prints summary counts,
/// and serializes the lookup to disk.
/// </summary>
/// <param name="outputSerializedPath">
/// Destination file for the serialized dictionary. When null or blank, the historical
/// hard-coded desktop path is used so existing callers keep working.
/// </param>
static void BuildDictionary(string outputSerializedPath)
{
    // Column layout of the input rows. Presumably the OpenAddresses export format
    // (street name at index 3, postcode at index 8) - TODO confirm against the CSV headers.
    const int expectedColumnCount = 11;
    const int streetColumn = 3;
    const int zipColumn = 8;

    // The parameter used to be ignored in favour of this path; keep it as the fallback.
    const string legacySerializedPath = "c:/users/brush/desktop/farts.dat";
    string serializedPath = string.IsNullOrWhiteSpace(outputSerializedPath)
        ? legacySerializedPath
        : outputSerializedPath;

    Data data = DataLoader.LoadData(regenerateBKTree: true);

    string[] sourceFiles =
    {
        @"C:\Users\brush\Downloads\openaddr-collected-us_northeast\us\ny\statewide.csv",
        @"D:\repos\MitchMatch\UndressAddress\city_of_new_york.csv",
    };

    // Stream each file, drop its first line (Skip(1)), upper-case and split every remaining row.
    List<string[]> allLines = new List<string[]>();
    foreach (string sourceFile in sourceFiles)
    {
        allLines.AddRange(
            File.ReadLines(sourceFile)
                .Skip(1)
                .Select(n => DecisionTreeLearner.Data.DataLoader.SmartSplit(n.ToUpper())));
    }

    Dictionary<string, List<int>> street2Zip = new Dictionary<string, List<int>>();
    int count = 0;

    Parallel.ForEach(allLines, line =>
    {
        // Use the value returned by Increment: re-reading the shared counter is racy and
        // can skip or duplicate progress lines.
        int processed = Interlocked.Increment(ref count);
        if (processed % 100000 == 0)
        {
            Console.WriteLine($"{processed}/{allLines.Count}");
        }

        // Rows that do not have the expected number of columns are ignored.
        if (line.Length != expectedColumnCount)
        {
            return;
        }

        string streetName = line[streetColumn];

        // Replace a trailing short suffix with the long suffix at the same index.
        string[] streetNameParts = streetName.Split(' ');
        string possibleSuffix = streetNameParts[streetNameParts.Length - 1];
        for (int c = 0; c < data.Suffixes.ShortSuffixes.Length; c++)
        {
            if (data.Suffixes.ShortSuffixes[c] == possibleSuffix)
            {
                streetName = string.Join(" ", streetNameParts.Take(streetNameParts.Length - 1))
                    + " " + data.Suffixes.LongSuffixes[c];
                break;
            }
        }

        // Pure string work on a local: done outside the lock so the loop actually runs in parallel.
        streetName = ExpandStreetAbbreviations(streetName);
        if (string.IsNullOrEmpty(streetName))
        {
            return;
        }

        int zip;
        bool hasZip = int.TryParse(line[zipColumn], out zip);

        // Only the shared dictionary needs protection.
        lock (street2Zip)
        {
            List<int> zips;
            if (!street2Zip.TryGetValue(streetName, out zips))
            {
                // A street is recorded even when its ZIP does not parse; such entries are
                // counted as "no zip codes" below.
                zips = new List<int>();
                street2Zip.Add(streetName, zips);
            }

            if (hasZip)
            {
                zips.Add(zip);
            }
        }
    });

    // De-duplicate the ZIP list of every street and count streets left without any ZIP.
    int numberWithNoZipCodes = 0;
    string[] keys = street2Zip.Keys.ToArray();
    foreach (string key in keys)
    {
        List<int> distinctZips = street2Zip[key].Distinct().ToList();
        street2Zip[key] = distinctZips;
        if (distinctZips.Count == 0)
        {
            numberWithNoZipCodes++;
        }
    }

    using (StreamWriter sw = File.CreateText("c:/users/brush/desktop/allStreets.csv"))
    {
        foreach (string key in keys)
        {
            sw.WriteLine(key);
        }
    }

    Console.WriteLine("Number of streets: " + keys.Length.ToString());
    Console.WriteLine("Number with no zip codes: " + numberWithNoZipCodes.ToString());

    // NOTE(review): BinaryFormatter is obsolete and unsafe for untrusted data. It is kept here
    // because whatever reads this file presumably expects this format - migrate both sides together.
    BinaryFormatter bf = new BinaryFormatter();
    using (FileStream fout = File.Create(serializedPath))
    {
        bf.Serialize(fout, street2Zip);
    }
}

/// <summary>
/// Collapses runs of spaces and expands directional and street-type abbreviations in a street name.
/// </summary>
/// <param name="streetName">Upper-cased street name to normalize.</param>
/// <returns>The normalized street name.</returns>
static string ExpandStreetAbbreviations(string streetName)
{
    streetName = Regex.Replace(streetName, "( +)", " ");

    // Leading / trailing compass directions.
    streetName = Regex.Replace(streetName, "^N ", "NORTH ");
    streetName = Regex.Replace(streetName, "^S ", "SOUTH ");
    streetName = Regex.Replace(streetName, "^E ", "EAST ");
    streetName = Regex.Replace(streetName, "^W ", "WEST ");
    streetName = Regex.Replace(streetName, " N$", " NORTH");
    streetName = Regex.Replace(streetName, " S$", " SOUTH");
    streetName = Regex.Replace(streetName, " E$", " EAST");
    streetName = Regex.Replace(streetName, " W$", " WEST");

    // Street-type abbreviations that appear inside or at the start of the name.
    streetName = Regex.Replace(streetName, " RD ", " ROAD ");
    streetName = Regex.Replace(streetName, " AVE ", " AVENUE ");
    streetName = Regex.Replace(streetName, "^AVE ", "AVENUE ");
    streetName = Regex.Replace(streetName, "^BCH ", "BEACH ");

    // Word boundary so an already spelled-out "GRAND CONCOURSE" is left alone; a plain
    // string Replace turned it into "GRAND CONCOURSEOURSE".
    streetName = Regex.Replace(streetName, @"GRAND CONC\b", "GRAND CONCOURSE");

    return streetName;
}
/// <summary>
/// Runs the address matcher over every entry of the final data set in parallel, printing match
/// statistics and a remaining-time estimate every 1000 entries, then writes "failed.txt" and
/// "CleanedAddresses.csv", prints a final summary and waits for a key press.
/// </summary>
/// <returns>
/// Always <c>null</c>; the results are delivered through the output files, not the return value.
/// </returns>
static List<string> GetCleanedNYStreetList2()
{
    Console.WriteLine("Loading data...");
    Data data = DataLoader.LoadDataBen(regenerateBKTree: true);
    Console.WriteLine("Data loaded.");

    int iterations = 0;
    Address[] cleanedAddresses = new Address[data.FinalDataSet.Length];

    // State used only by the progress report. It is shared between worker threads, so every
    // access happens under reportGate.
    object reportGate = new object();
    DateTime lastTime = DateTime.UtcNow; // UTC: elapsed-time maths must not jump on DST changes
    List<double> timeSpans = new List<double>();

    // Go over each entry of the final data set.
    Parallel.For(0, data.FinalDataSet.Length, c =>
    {
        #region DebuggingOutput
        // Use the value returned by Increment: re-reading the shared counter is racy and can
        // skip or repeat a report.
        int completed = Interlocked.Increment(ref iterations);
        if (completed % 1000 == 0)
        {
            // List<double> is not thread-safe; serialize the bookkeeping and the console output.
            lock (reportGate)
            {
                DateTime now = DateTime.UtcNow;
                timeSpans.Add((now - lastTime).TotalMilliseconds);

                // Estimate: average milliseconds per batch of 1000 times the batches remaining.
                double averageTime = timeSpans.Average();
                double numberOfChecksLeft = (data.FinalDataSet.Length - completed) / 1000.0;
                double hoursLeft = (averageTime * numberOfChecksLeft) / 1000.0 / 60.0 / 60.0;

                // Keep a rolling window of the most recent batch timings.
                if (timeSpans.Count > 100)
                {
                    timeSpans.RemoveAt(0);
                }

                int sum = level1Match.Count + level2Match.Count + level3Match.Count
                    + level4Match.Count + level5Match.Count + level6Match.Count
                    + level7Match.Count + level8Match.Count + level9Match.Count
                    + level10Match.Count + homeless.Count + unknown.Count;
                double percentage = (sum / (completed * 1.0)) * 100;

                Console.Clear();
                Console.WriteLine($"Level 1 Match: {level1Match.Count}");
                Console.WriteLine($"Level 2 Match: {level2Match.Count}");
                Console.WriteLine($"Level 3 Match: {level3Match.Count}");
                Console.WriteLine($"Level 4 Match: {level4Match.Count}");
                Console.WriteLine($"Level 5 Match: {level5Match.Count}");
                Console.WriteLine($"Level 6 Match: {level6Match.Count}");
                Console.WriteLine($"Level 7 Match: {level7Match.Count}");
                Console.WriteLine($"Level 8 Match: {level8Match.Count}");
                Console.WriteLine($"Level 9 Match: {level9Match.Count}");
                Console.WriteLine($"Level 10 Match: {level10Match.Count}");
                Console.WriteLine($"Homeless or Unknown: {homeless.Count + unknown.Count}");
                Console.WriteLine($"Failed: {failed.Count}");
                Console.WriteLine("========SUMMARY=======");
                Console.WriteLine($"{completed}/{data.FinalDataSet.Length}: {percentage:0.00}% matched. {hoursLeft:0.00} hours left.");

                lastTime = now;
            }
        }
        #endregion

        // Each iteration writes only its own slot, so the array needs no locking.
        cleanedAddresses[c] = LucasAddressMatch(data.FinalDataSet[c], data);
    });

    using (StreamWriter fout = File.CreateText("failed.txt"))
    {
        for (int c = 0; c < failed.Count; c++)
        {
            fout.WriteLine(failed[c]);
        }
    }

    // One line per address: "<number> <street>" or just the street when there is no number.
    File.WriteAllLines(
        "CleanedAddresses.csv",
        cleanedAddresses.Select(a => a.StreetNumber != "" ? a.StreetNumber + " " + a.FullStreetName : a.FullStreetName));

    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine($"Level 1 Match: {level1Match.Count}");
    Console.WriteLine($"Level 2 Match: {level2Match.Count}");
    Console.WriteLine($"Level 3 Match: {level3Match.Count}");
    Console.WriteLine($"Level 4 Match: {level4Match.Count}");
    Console.WriteLine($"Level 5 Match: {level5Match.Count}");
    Console.WriteLine($"Level 6 Match: {level6Match.Count}");
    Console.WriteLine($"Level 7 Match: {level7Match.Count}");
    Console.WriteLine($"Level 8 Match: {level8Match.Count}");
    Console.WriteLine($"Level 9 Match: {level9Match.Count}");
    // Level 10 was missing from the final summary although the progress report and the
    // matched total both include it.
    Console.WriteLine($"Level 10 Match: {level10Match.Count}");
    Console.WriteLine($"Homeless or Unknown: {homeless.Count + unknown.Count}");
    Console.WriteLine($"Finished. {iterations}/{data.FinalDataSet.Length}.");

    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
    return null;
}