Ejemplo n.º 1
0
        /// <summary>
        /// Scans the delimited file at <c>addrPath</c> and collects the rows of the requested county
        /// whose normalized address tokens fuzzily match every token of the user's search.
        /// </summary>
        /// <param name="user_county">County to search in; compared case-insensitively with each row's county column.</param>
        /// <param name="user_search_normalized">Normalized tokens of the user's search. Every token must match at least one token of a candidate address.</param>
        /// <returns>
        /// The matching rows that carry a building number or a sub-building name; when no matching row
        /// carries either, all matching rows. An empty list when nothing matches or the file has no header row.
        /// </returns>
        public List <FullAddressFields> getBestAddresses(string user_county, string[] user_search_normalized)
        {
            // TextFieldParser keeps the file open, so dispose it on every exit path.
            using (TextFieldParser parser = new TextFieldParser(addrPath))
            {
                parser.TextFieldType = FieldType.Delimited;
                parser.SetDelimiters(",");

                // The first row is the header. ReadFields returns null when the file contains no rows.
                string[] fields = parser.ReadFields();
                if (fields == null)
                {
                    return new List<FullAddressFields>();
                }

                int fields_len = fields.Length;

                // Column positions resolved from the header; -1 means the column was not found.
                int county_index              = -1;
                int locality_index            = -1;
                int secLocality_index         = -1;
                int thorofare_index           = -1;
                int address_index             = -1;
                int building_id_index         = -1;
                int address_reference_index   = -1;
                int building_number_index     = -1;
                int building_group_name_index = -1;
                int building_name_index       = -1;
                int sub_building_name_index   = -1;
                int department_index          = -1;
                int organisation_name_index   = -1;

                for (int col_index = 0; col_index < fields_len; col_index++)
                {
                    string value = fields.GetValueAt<string>(col_index);

                    // First matching column name wins, exactly one index is assigned per header cell.
                    if (string.Equals(county, value, StringComparison.OrdinalIgnoreCase))
                    {
                        county_index = col_index;
                    }
                    else if (string.Equals(locality, value, StringComparison.OrdinalIgnoreCase))
                    {
                        locality_index = col_index;
                    }
                    else if (string.Equals(secLocality, value, StringComparison.OrdinalIgnoreCase))
                    {
                        secLocality_index = col_index;
                    }
                    else if (string.Equals(thorofare, value, StringComparison.OrdinalIgnoreCase))
                    {
                        thorofare_index = col_index;
                    }
                    else if (string.Equals(address, value, StringComparison.OrdinalIgnoreCase))
                    {
                        address_index = col_index;
                    }
                    else if (string.Equals(building, value, StringComparison.OrdinalIgnoreCase))
                    {
                        building_id_index = col_index;
                    }
                    else if (string.Equals(address_reference, value, StringComparison.OrdinalIgnoreCase))
                    {
                        address_reference_index = col_index;
                    }
                    else if (string.Equals(building_number, value, StringComparison.OrdinalIgnoreCase))
                    {
                        building_number_index = col_index;
                    }
                    else if (string.Equals(building_group_name, value, StringComparison.OrdinalIgnoreCase))
                    {
                        building_group_name_index = col_index;
                    }
                    else if (string.Equals(building_name, value, StringComparison.OrdinalIgnoreCase))
                    {
                        building_name_index = col_index;
                    }
                    else if (string.Equals(sub_building_name, value, StringComparison.OrdinalIgnoreCase))
                    {
                        sub_building_name_index = col_index;
                    }
                    else if (string.Equals(department, value, StringComparison.OrdinalIgnoreCase))
                    {
                        department_index = col_index;
                    }
                    else if (string.Equals(organisation_name, value, StringComparison.OrdinalIgnoreCase))
                    {
                        organisation_name_index = col_index;
                    }
                }

                HashSet<FullAddressFields> top_addresses = new HashSet<FullAddressFields>();

                // Created once and reused for every row instead of once per matching row.
                var dist = new F23.StringSimilarity.Damerau();

                while (!parser.EndOfData)
                {
                    fields = parser.ReadFields();

                    // Map the row onto the full address record.
                    FullAddressFields fulladdr = new FullAddressFields();
                    fulladdr.locality           = fields.GetValueAt<string>(locality_index);
                    fulladdr.secondary_locality = fields.GetValueAt<string>(secLocality_index);
                    fulladdr.thorofare          = fields.GetValueAt<string>(thorofare_index);
                    fulladdr.building_number    = fields.GetValueAt<string>(building_number_index);
                    fulladdr.county             = fields.GetValueAt<string>(county_index);
                    fulladdr.address            = fields.GetValueAt<string>(address_index);
                    fulladdr.building           = fields.GetValueAt<int>(building_id_index);
                    fulladdr.address_reference  = fields.GetValueAt<long>(address_reference_index);

                    fulladdr.building_group_name = fields.GetValueAt<string>(building_group_name_index);
                    fulladdr.building_name       = fields.GetValueAt<string>(building_name_index);
                    fulladdr.sub_building_name   = fields.GetValueAt<string>(sub_building_name_index);
                    fulladdr.department          = fields.GetValueAt<string>(department_index);
                    fulladdr.organisation_name   = fields.GetValueAt<string>(organisation_name_index);

                    if (!string.Equals(user_county, fulladdr.county, StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    // Blank out the text of the last comma-separated segment of the address
                    // (string.Replace removes every occurrence of that text) before normalizing.
                    // Kept ahead of the number filters below so ner.normalize is called for the
                    // same rows as before.
                    string   address_split            = fulladdr.address.Replace(fulladdr.address.Split(",").Last(), "");
                    string[] address_split_normalized = ner.normalize(address_split);

                    // A row that has a building number is only a candidate when that number is in ner.numbers.
                    if (!fulladdr.building_number.IsEmpty())
                    {
                        if (!ner.numbers.Contains<string>(fulladdr.building_number.ToLower().Trim()))
                        {
                            continue;
                        }
                    }

                    // Same rule for the sub-building, after stripping the apartment/unit/flat label.
                    if (!fulladdr.sub_building_name.IsEmpty())
                    {
                        string sub_building_number = Regex.Replace(fulladdr.sub_building_name, @"(apartment)|(unit)|(flat)", "", RegexOptions.IgnoreCase).ToLower().Trim();
                        if (!ner.numbers.Contains<string>(sub_building_number))
                        {
                            continue;
                        }
                    }

                    // Count the search tokens that have at least one close-enough address token.
                    int tokens_match = 0;
                    foreach (string user_search_token in user_search_normalized)
                    {
                        foreach (string address_token in address_split_normalized)
                        {
                            double score  = Fuzz.WeightedRatio(user_search_token, address_token, PreprocessMode.Full);
                            double score2 = dist.Distance(user_search_token, address_token);

                            // Either measure is sufficient: ratio at or above MIN_SCORE * 100,
                            // or Damerau distance at or below MAX_DIST.
                            if (score >= MIN_SCORE * 100 || score2 <= MAX_DIST)
                            {
                                tokens_match += 1;
                                break;
                            }
                        }
                    }

                    // Keep the row only when every search token found a match.
                    if (user_search_normalized.Length == tokens_match)
                    {
                        top_addresses.Add(fulladdr);
                    }
                }

                // Prefer the matches that identify a concrete building or sub-building.
                List<FullAddressFields> best_addresses = new List<FullAddressFields>();
                foreach (FullAddressFields addr in top_addresses)
                {
                    if (!addr.building_number.IsEmpty() || !addr.sub_building_name.IsEmpty())
                    {
                        best_addresses.Add(addr);
                    }
                }

                if (best_addresses.Count == 0)
                {
                    return top_addresses.ToList<FullAddressFields>();
                }

                return best_addresses;
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Entry point. Loads the locality, secondary-locality and thorofare databases, builds the
        /// training data and model when the training file cannot be opened, then processes every
        /// <c>*.csv</c> file in the partial-addresses directory: each row's address is normalized, run
        /// through the model, matched against the full-address file, and the outcome is written to the
        /// corresponding Elasticsearch document. Match statistics are printed per file.
        /// </summary>
        static void Main()
        {
            Paths paths = new Paths();
            // init locality database
            DataBase locality = new DataBase(paths.path.localitiesPath, "county_id", "locality_id", "name");

            // init secondary locality database
            DataBase secLocality = new DataBase(paths.path.secLocalityPath, "county_id", "locality_id", "name");

            // init thorofare database
            DataBase thorofare = new DataBase(paths.path.thorofarePath, "county_id", "thorfare_id", "thorfare_name");

            // train data
            NERTrain train = new NERTrain(ref locality, ref secLocality, ref thorofare);

            try
            {
                // Opening the file is only a probe for "training data already exists";
                // dispose the stream immediately so the handle is not leaked.
                using (File.OpenRead(paths.path.trainPath))
                {
                }
            }
            catch (IOException)
            {
                Console.WriteLine("Generating training data...");
                train.createTrainingData(paths.path.fullAddressesPath, paths.path.trainPath);
                Console.WriteLine("Creating model...");
                train.createModelFromTrainingData(paths.path.trainPath, paths.path.modelPath, paths.path.properties);
            }

            foreach (string partial_file in Directory.EnumerateFiles(paths.path.partialAddresses, "*.csv"))
            {
                // TextFieldParser keeps the file open, so dispose it when the file is done.
                using (TextFieldParser parser = new TextFieldParser(partial_file))
                {
                    parser.TextFieldType = FieldType.Delimited;
                    parser.SetDelimiters(",");

                    // Header row; ReadFields returns null when the file contains no rows.
                    string[] fields = parser.ReadFields();
                    if (fields == null)
                    {
                        continue;
                    }

                    // Default column positions, overridden when the header names the columns.
                    int address_index = 1;
                    int county_index  = 3;
                    for (int field_index = 0; field_index < fields.Length; field_index++)
                    {
                        if (string.Equals(fields[field_index], "address", StringComparison.OrdinalIgnoreCase))
                        {
                            address_index = field_index;
                        }
                        else if (string.Equals(fields[field_index], "county", StringComparison.OrdinalIgnoreCase))
                        {
                            county_index = field_index;
                        }
                    }

                    Stats   statistics = new Stats();
                    Elastic obj        = null;
                    int     doc_id     = 0;
                    while (!parser.EndOfData)
                    {
                        doc_id += 1;
                        Console.Write("\n");
                        fields = parser.ReadFields();
                        string user_search = fields.GetValueAt<string>(address_index);
                        string county      = fields.GetValueAt<string>(county_index);

                        if (obj == null)
                        {
                            // Index the file once, on its first data row, instead of re-sending the
                            // whole file for every row.
                            // NOTE(review): presumably the bulk request writes documents by id, in which
                            // case re-sending per row also reset earlier rows' results - confirm against Elastic.
                            obj = new Elastic(paths.path.indexPath);

                            var json = obj.ConvertCsvFileToJsonBulkList(partial_file);
                            // docs indexed starting from id 1
                            obj.SendJsonToElastic(json);
                        }

                        // print input address
                        Console.WriteLine("Input address: " + user_search);

                        // set processing flag in elastic
                        obj.updateDocument(doc_id.ToString(), @"{""doc"": {""flag"": ""processing""}}");
                        Console.WriteLine("Document in elasticsearch: " + obj.QueryElasticById(doc_id.ToString()));

                        // begin searching
                        Console.ForegroundColor = ConsoleColor.Green;
                        Console.WriteLine("Searching...");
                        Console.ResetColor();

                        // init model
                        NERAddress address = new NERAddress();
                        // normalize data
                        string[] user_search_normalized = address.normalize(user_search);

                        // print what model found
                        train.useModel(paths.path.modelPath, user_search.ToLower(), ref address);

                        // print normalized address
                        Console.WriteLine("Normalized address[tokens]: " + string.Join("|", user_search_normalized));

                        // init FullAddress
                        FullAddress addr = new FullAddress(paths.path.fullAddressesPath, ref address);
                        // set flag to processed in elastic
                        obj.updateDocument(doc_id.ToString(), @"{""doc"": {""flag"": ""processed""}}");

                        // one match is merged into the document itself; several are stored as nested "matches"
                        List<FullAddressFields> best_addr = addr.getBestAddresses(county, user_search_normalized);
                        if (best_addr.Count == 0)
                        {
                            Console.WriteLine("No matches found");
                            statistics.no_matches += 1;
                        }
                        else if (best_addr.Count == 1)
                        {
                            obj.updateDocument(doc_id.ToString(), @"{""doc"":" + JsonConvert.SerializeObject(best_addr[0]) + "}");
                            statistics.exact_matches += 1;
                        }
                        else
                        {
                            List<string> json_objects = new List<string>();
                            foreach (FullAddressFields best_addr_i in best_addr)
                            {
                                json_objects.Add(JsonConvert.SerializeObject(best_addr_i));
                            }

                            string nested_json = "[" + String.Join(",", json_objects) + "]";
                            obj.updateDocument(doc_id.ToString(), @"{""doc"": {""matches"": " + nested_json + "}}");
                            statistics.maybe_matches += 1;
                        }
                        Console.WriteLine("Document in elasticsearch: " + obj.QueryElasticById(doc_id.ToString()));

                        // Output only addresses
                        foreach (FullAddressFields best_addr_i in best_addr)
                        {
                            // Write the address as plain text: passing it as the format argument of
                            // WriteLine(format, arg) throws FormatException when it contains '{' or '}'.
                            Console.ForegroundColor = ConsoleColor.Green;
                            Console.WriteLine(best_addr_i.address);
                            Console.ResetColor();
                        }
                    }

                    // print statistics
                    Console.WriteLine("Exact matches found: " + statistics.exact_matches);
                    Console.WriteLine("One to many matches found: " + statistics.maybe_matches);
                    Console.WriteLine("No matches found: " + statistics.no_matches);
                }
            }
        }