예제 #1
0
        /// <summary>
        /// Builds a street-name to ZIP-code lookup from two CSV files read from hard-coded
        /// paths (a New York statewide export and a City of New York export), writes the
        /// list of distinct street names to a CSV file, and serializes the lookup to disk.
        /// </summary>
        /// <param name="outputSerializedPath">
        /// Destination file for the serialized <c>Dictionary&lt;string, List&lt;int&gt;&gt;</c>.
        /// When null, empty, or whitespace, the legacy hard-coded location is used.
        /// </param>
        /// <remarks>
        /// Only rows that split into exactly 11 fields are used; field 3 is treated as the
        /// street name and field 8 as the ZIP code. A street whose ZIP never parses as an
        /// integer is still recorded, with an empty ZIP list.
        /// </remarks>
        static void BuildDictionary(string outputSerializedPath)
        {
            // Previously the parameter was ignored and this path was always used; keep it
            // as the fallback so callers that pass nothing useful still get the old file.
            const string DefaultSerializedPath = "c:/users/brush/desktop/farts.dat";
            string serializedPath = string.IsNullOrWhiteSpace(outputSerializedPath)
                ? DefaultSerializedPath
                : outputSerializedPath;

            Data data = DataLoader.LoadData(regenerateBKTree: true);

            string[] stateWide = File.ReadAllLines(@"C:\Users\brush\Downloads\openaddr-collected-us_northeast\us\ny\statewide.csv").Select(n => n.ToUpper()).ToArray();
            string[] newYork   = File.ReadAllLines(@"D:\repos\MitchMatch\UndressAddress\city_of_new_york.csv").Select(n => n.ToUpper()).ToArray();

            // Skip(1) drops each file's header row. The rows are already upper-cased above.
            List<string[]> allLines = new List<string[]>();
            allLines.AddRange(stateWide.Skip(1).Select(n => DecisionTreeLearner.Data.DataLoader.SmartSplit(n)));
            allLines.AddRange(newYork.Skip(1).Select(n => DecisionTreeLearner.Data.DataLoader.SmartSplit(n)));

            Dictionary<string, List<int>> street2Zip = new Dictionary<string, List<int>>();

            int count = 0;

            Parallel.ForEach(allLines, line =>
            {
                // Use the value returned by Increment: re-reading the shared counter could
                // observe another thread's update and skip or repeat a progress line.
                int processed = Interlocked.Increment(ref count);
                if (processed % 100000 == 0)
                {
                    Console.WriteLine($"{processed}/{allLines.Count}");
                }

                if (line.Length != 11)
                {
                    return;
                }

                string streetName        = line[3];
                string[] streetNameParts = streetName.Split(' ');
                string possibleSuffix    = streetNameParts[streetNameParts.Length - 1];

                // Expand a trailing short suffix to the long form at the same index.
                for (int c = 0; c < data.Suffixes.ShortSuffixes.Length; c++)
                {
                    if (data.Suffixes.ShortSuffixes[c] == possibleSuffix)
                    {
                        streetName = string.Join(" ", streetNameParts.Take(streetNameParts.Length - 1)) + " " +
                                     data.Suffixes.LongSuffixes[c];
                        break;
                    }
                }

                // Normalization touches only locals, so it runs outside the lock and the
                // workers are not serialized on regex work.
                streetName = Regex.Replace(streetName, "( +)", " ");
                streetName = Regex.Replace(streetName, "^N ", "NORTH ");
                streetName = Regex.Replace(streetName, "^S ", "SOUTH ");
                streetName = Regex.Replace(streetName, "^E ", "EAST ");
                streetName = Regex.Replace(streetName, "^W ", "WEST ");
                streetName = Regex.Replace(streetName, " N$", " NORTH");
                streetName = Regex.Replace(streetName, " S$", " SOUTH");
                streetName = Regex.Replace(streetName, " E$", " EAST");
                streetName = Regex.Replace(streetName, " W$", " WEST");

                streetName = Regex.Replace(streetName, " RD ", " ROAD ");
                streetName = Regex.Replace(streetName, " AVE ", " AVENUE ");
                streetName = Regex.Replace(streetName, "^AVE ", "AVENUE ");
                streetName = Regex.Replace(streetName, "^BCH ", "BEACH ");
                streetName = streetName.Replace("GRAND CONC", "GRAND CONCOURSE");

                if (string.IsNullOrEmpty(streetName))
                {
                    return;
                }

                int zip;
                bool hasZip = int.TryParse(line[8], out zip);

                // Only the shared dictionary needs mutual exclusion.
                lock (street2Zip)
                {
                    List<int> zips;
                    if (!street2Zip.TryGetValue(streetName, out zips))
                    {
                        zips = new List<int>();
                        street2Zip.Add(streetName, zips);
                    }

                    if (hasZip)
                    {
                        zips.Add(zip);
                    }
                }
            });

            int numberWithNoZipCodes = 0;

            // Snapshot the keys so values can be replaced while iterating.
            string[] keys = street2Zip.Keys.ToArray();
            foreach (string key in keys)
            {
                List<int> distinctZips = street2Zip[key].Distinct().ToList();
                street2Zip[key] = distinctZips;

                if (distinctZips.Count == 0)
                {
                    numberWithNoZipCodes++;
                }
            }

            File.WriteAllLines("c:/users/brush/desktop/allStreets.csv", keys);

            Console.WriteLine("Number of streets: " + keys.Length.ToString());
            Console.WriteLine("Number with no zip codes: " + numberWithNoZipCodes.ToString());

            // NOTE(review): BinaryFormatter is obsolete and unsafe for untrusted input. It is
            // kept here because existing readers of this file presumably deserialize the same
            // format; migrate writer and readers together (e.g. to System.Text.Json).
            BinaryFormatter bf = new BinaryFormatter();

            using (FileStream fout = File.Create(serializedPath))
            {
                bf.Serialize(fout, street2Zip);
            }
        }
예제 #2
0
        /// <summary>
        /// Loads the data set via <c>DataLoader.LoadDataBen</c>, runs <c>LucasAddressMatch</c>
        /// over every entry of <c>data.FinalDataSet</c> in parallel, writes the failures to
        /// "failed.txt" and the cleaned addresses to "CleanedAddresses.csv", prints a summary
        /// of the per-level match counters, and then waits for a line of console input.
        /// </summary>
        /// <returns>
        /// Always <c>null</c>; the results are delivered through the output files only.
        /// </returns>
        static List<string> GetCleanedNYStreetList2()
        {
            Console.WriteLine("Loading data...");
            Data data = DataLoader.LoadDataBen(regenerateBKTree: true);
            Console.WriteLine("Data loaded.");

            int total      = data.FinalDataSet.Length;
            int iterations = 0;

            Address[] cleanedAddresses = new Address[total];

            // Progress-report state. It is shared by all workers, so every access goes
            // through progressGate (List<T> is not safe for concurrent mutation).
            object        progressGate = new object();
            DateTime      lastTime     = DateTime.Now;
            List <double> timeSpans    = new List <double>();

            Parallel.For(0, total, c =>
            {
                // Use the value returned by Increment: re-reading the shared counter could
                // observe another thread's update and skip or repeat a report.
                int done = Interlocked.Increment(ref iterations);
                if (done % 1000 == 0)
                {
                    lock (progressGate)
                    {
                        DateTime now = DateTime.Now;
                        timeSpans.Add((now - lastTime).TotalMilliseconds);

                        // Average time per 1000 items over the recent reports, projected
                        // onto the number of 1000-item batches still to go.
                        double averageTime        = timeSpans.Average();
                        double numberOfChecksLeft = (total - done) / 1000.0;
                        double hoursLeft          = (averageTime * numberOfChecksLeft) / 1000.0 / 60.0 / 60.0;

                        // Keep the moving-average window bounded.
                        if (timeSpans.Count > 100)
                        {
                            timeSpans.RemoveAt(0);
                        }

                        int sum = (level1Match.Count + level2Match.Count +
                                   level3Match.Count + level4Match.Count +
                                   level5Match.Count + level6Match.Count +
                                   level7Match.Count + level8Match.Count +
                                   level9Match.Count + level10Match.Count +
                                   homeless.Count + unknown.Count);

                        double percentage = (sum / (double)done) * 100;

                        Console.Clear();
                        Console.WriteLine($"Level 1 Match: {level1Match.Count}");
                        Console.WriteLine($"Level 2 Match: {level2Match.Count}");
                        Console.WriteLine($"Level 3 Match: {level3Match.Count}");
                        Console.WriteLine($"Level 4 Match: {level4Match.Count}");
                        Console.WriteLine($"Level 5 Match: {level5Match.Count}");
                        Console.WriteLine($"Level 6 Match: {level6Match.Count}");
                        Console.WriteLine($"Level 7 Match: {level7Match.Count}");
                        Console.WriteLine($"Level 8 Match: {level8Match.Count}");
                        Console.WriteLine($"Level 9 Match: {level9Match.Count}");
                        Console.WriteLine($"Level 10 Match: {level10Match.Count}");
                        Console.WriteLine($"Homeless or Unknown: {homeless.Count + unknown.Count}");
                        Console.WriteLine($"Failed: {failed.Count}");
                        Console.WriteLine("========SUMMARY=======");
                        Console.WriteLine($"{done}/{total}: {percentage.ToString("0.00")}% matched. {hoursLeft.ToString("0.00")} hours left.");

                        lastTime = now;
                    }
                }

                // Each worker writes only its own slot, so the array needs no lock.
                // Alternative matcher: BenAddressMatch(data.FinalDataSet[c], data).
                cleanedAddresses[c] = LucasAddressMatch(data.FinalDataSet[c], data);
            });

            using (StreamWriter fout = File.CreateText("failed.txt"))
            {
                for (int c = 0; c < failed.Count; c++)
                {
                    fout.WriteLine(failed[c]);
                }
            }

            // One line per address: "<number> <street>", or just the street when there is no number.
            File.WriteAllLines("CleanedAddresses.csv", cleanedAddresses.Select(a => (a.StreetNumber != "" ? a.StreetNumber + " " + a.FullStreetName : a.FullStreetName)));

            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine($"Level 1 Match: {level1Match.Count}");
            Console.WriteLine($"Level 2 Match: {level2Match.Count}");
            Console.WriteLine($"Level 3 Match: {level3Match.Count}");
            Console.WriteLine($"Level 4 Match: {level4Match.Count}");
            Console.WriteLine($"Level 5 Match: {level5Match.Count}");
            Console.WriteLine($"Level 6 Match: {level6Match.Count}");
            Console.WriteLine($"Level 7 Match: {level7Match.Count}");
            Console.WriteLine($"Level 8 Match: {level8Match.Count}");
            Console.WriteLine($"Level 9 Match: {level9Match.Count}");
            // Previously missing from the final summary although it is counted above.
            Console.WriteLine($"Level 10 Match: {level10Match.Count}");
            Console.WriteLine($"Homeless or Unknown: {homeless.Count + unknown.Count}");
            Console.WriteLine($"Finished. {iterations}/{total}.");

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();

            // NOTE(review): no list is produced despite the method name; callers receive null.
            return null;
        }