Esempio n. 1
0
 /// <summary>
 /// Creates a <c>FullAddress</c> bound to an address source path and a parsed address.
 /// </summary>
 /// <param name="addrPath">Value stored in the <c>addrPath</c> member.</param>
 /// <param name="ner">
 /// Value stored in the <c>ner</c> member. It is taken by <c>ref</c> but is never
 /// reassigned inside this constructor.
 /// </param>
 public FullAddress(string addrPath, ref NERAddress ner)
 {
     // Fixed thresholds; presumably the minimum similarity score and the maximum
     // edit distance used when matching -- TODO confirm against the matching code.
     MIN_SCORE = 0.90;
     MAX_DIST  = 2;

     // Caller-supplied state.
     this.ner      = ner;
     this.addrPath = addrPath;
 }
Esempio n. 2
0
        /// <summary>
        /// Program entry point.
        /// Builds the three reference databases, generates the training data and model
        /// when the training file does not exist yet, and then processes every
        /// <c>*.csv</c> file found in the partial-addresses directory: the file is sent
        /// to Elasticsearch, each row's address is tagged with the model, the best
        /// matching full addresses are looked up, and the outcome is written back to the
        /// row's Elasticsearch document. Per-file match statistics are printed at the end
        /// of each file.
        /// </summary>
        static void Main()
        {
            Paths paths = new Paths();

            // Reference databases handed to the trainer.
            DataBase locality    = new DataBase(paths.path.localitiesPath, "county_id", "locality_id", "name");
            DataBase secLocality = new DataBase(paths.path.secLocalityPath, "county_id", "locality_id", "name");
            DataBase thorofare   = new DataBase(paths.path.thorofarePath, "county_id", "thorfare_id", "thorfare_name");

            NERTrain train = new NERTrain(ref locality, ref secLocality, ref thorofare);

            // Only (re)build the training data and the model when the training file is
            // missing. File.Exists avoids opening (and leaking) a stream just to probe
            // for the file and avoids using an exception as control flow.
            if (!File.Exists(paths.path.trainPath))
            {
                Console.WriteLine("Generating training data...");
                train.createTrainingData(paths.path.fullAddressesPath, paths.path.trainPath);
                Console.WriteLine("Creating model...");
                train.createModelFromTrainingData(paths.path.trainPath, paths.path.modelPath, paths.path.properties);
            }

            foreach (string partial_file in Directory.EnumerateFiles(paths.path.partialAddresses, "*.csv"))
            {
                // The parser owns a file handle, so make sure it is released per file.
                using (TextFieldParser parser = new TextFieldParser(partial_file))
                {
                    parser.TextFieldType = FieldType.Delimited;
                    parser.SetDelimiters(",");

                    // First record is the header row; ReadFields returns null when the
                    // file holds no data at all.
                    string[] fields = parser.ReadFields();
                    if (fields == null)
                    {
                        Console.WriteLine("Skipping empty file: " + partial_file);
                        continue;
                    }

                    // Column positions fall back to 1 / 3 when the header does not name them.
                    int address_index = 1;
                    int county_index  = 3;
                    for (int field_index = 0; field_index < fields.Length; field_index++)
                    {
                        if (string.Equals(fields[field_index], "address", StringComparison.OrdinalIgnoreCase))
                        {
                            address_index = field_index;
                        }
                        else if (string.Equals(fields[field_index], "county", StringComparison.OrdinalIgnoreCase))
                        {
                            county_index = field_index;
                        }
                    }

                    // Index the whole file once, before the row loop. Doing this per row
                    // repeated the full bulk upload for every record.
                    // NOTE(review): re-sending the bulk per row presumably also reset the
                    // flags/matches already stored for earlier rows -- confirm against Elastic.
                    Elastic obj = new Elastic(paths.path.indexPath);
                    var json = obj.ConvertCsvFileToJsonBulkList(partial_file);
                    // docs indexed starting from id 1
                    obj.SendJsonToElastic(json);

                    Stats statistics = new Stats();
                    int   doc_id     = 0;
                    while (!parser.EndOfData)
                    {
                        doc_id += 1;
                        string doc_key = doc_id.ToString();

                        Console.Write("\n");
                        fields = parser.ReadFields();
                        string user_search = fields.GetValueAt<string>(address_index);
                        string county      = fields.GetValueAt<string>(county_index);

                        // print input address
                        Console.WriteLine("Input address: " + user_search);

                        // set processing flag in elastic
                        obj.updateDocument(doc_key, @"{""doc"": {""flag"": ""processing""}}");
                        Console.WriteLine("Document in elasticsearch: " + obj.QueryElasticById(doc_key));

                        // begin searching
                        WriteGreenLine("Searching...");

                        // init model
                        NERAddress address = new NERAddress();
                        // normalize data
                        string[] user_search_normalized = address.normalize(user_search);

                        // tag the address with the model (useModel prints what it found)
                        train.useModel(paths.path.modelPath, user_search.ToLower(), ref address);

                        // print normalized address
                        Console.WriteLine("Normalized address[tokens]: " + string.Join("|", user_search_normalized));

                        // init FullAddress
                        FullAddress addr = new FullAddress(paths.path.fullAddressesPath, ref address);
                        // set flag to processed in elastic
                        obj.updateDocument(doc_key, @"{""doc"": {""flag"": ""processed""}}");

                        // 0 matches: nothing stored; 1 match: stored as the document itself;
                        // several matches: stored as a nested "matches" array.
                        List<FullAddressFields> best_addr = addr.getBestAddresses(county, user_search_normalized);
                        if (best_addr.Count == 0)
                        {
                            Console.WriteLine("No matches found");
                            statistics.no_matches += 1;
                        }
                        else if (best_addr.Count == 1)
                        {
                            obj.updateDocument(doc_key, @"{""doc"":" + JsonConvert.SerializeObject(best_addr[0]) + "}");
                            statistics.exact_matches += 1;
                        }
                        else
                        {
                            List<string> json_objects = new List<string>();
                            foreach (FullAddressFields best_addr_i in best_addr)
                            {
                                json_objects.Add(JsonConvert.SerializeObject(best_addr_i));
                            }

                            string nested_json = "[" + String.Join(",", json_objects) + "]";
                            obj.updateDocument(doc_key, @"{""doc"": {""matches"": " + nested_json + "}}");
                            statistics.maybe_matches += 1;
                        }
                        Console.WriteLine("Document in elasticsearch: " + obj.QueryElasticById(doc_key));

                        // Output only addresses. The address is written as plain text, never
                        // as a format string, so braces inside it cannot throw FormatException.
                        foreach (FullAddressFields best_addr_i in best_addr)
                        {
                            WriteGreenLine(best_addr_i.address);
                        }
                    }

                    // print statistics
                    Console.WriteLine("Exact matches found: " + statistics.exact_matches);
                    Console.WriteLine("One to many matches found: " + statistics.maybe_matches);
                    Console.WriteLine("No matches found: " + statistics.no_matches);
                }
            }
        }

        /// <summary>
        /// Writes <paramref name="text"/> as a single green line and then restores the
        /// console colours.
        /// </summary>
        /// <param name="text">Text written verbatim (not treated as a format string).</param>
        private static void WriteGreenLine(string text)
        {
            Console.ForegroundColor = ConsoleColor.Green;
            Console.WriteLine(text);
            Console.ResetColor();
        }
Esempio n. 3
0
        /// <summary>
        /// Classifies <paramref name="partial_address"/> with the CRF model loaded from
        /// <paramref name="inputPath"/> and appends the text of every recognised tag to
        /// the corresponding member of <paramref name="addr"/>.
        /// </summary>
        /// <param name="inputPath">Path passed to <c>CRFClassifier.getClassifierNoExceptions</c>.</param>
        /// <param name="partial_address">Text handed to the classifier.</param>
        /// <param name="addr">
        /// Receives the tagged pieces. Text members are appended to and then trimmed;
        /// <c>numbers</c> is replaced with the whitespace-separated tokens collected from
        /// <c>NUMBER</c> elements. Left untouched when the classifier output is not
        /// parseable as XML.
        /// </param>
        public void useModel(string inputPath, string partial_address, ref NERAddress addr)
        {
            CRFClassifier classifier = CRFClassifier.getClassifierNoExceptions(inputPath);

            // Rename the "0" tag to "ZERO" before handing the text to the XML parser.
            string tagged_address = classifier.classifyWithInlineXML(partial_address)
                                              .Replace("<0>", "<ZERO>")
                                              .Replace("</0>", "</ZERO>");

            // The inline-tagged text is a fragment, so give it a single root element.
            XmlDocument xml = new XmlDocument();
            try
            {
                xml.LoadXml("<root>" + tagged_address + "</root>");
            }
            catch (XmlException parseError)
            {
                Console.WriteLine("Exception occurred while parsing xml: " + parseError.Message);
                return;
            }

            Console.WriteLine("Model output: " + tagged_address);

            string collectedNumbers = "";

            foreach (XmlNode element in xml.DocumentElement.ChildNodes)
            {
                string tag   = element.Name;
                string piece = element.InnerText + " ";

                // The patterns are anchored and share no common label prefix, so at most
                // one branch can apply to any given element name.
                if (Regex.IsMatch(tag, "^[IOB]-LOCALITY"))
                {
                    addr.locality = addr.locality + piece;
                }
                else if (Regex.IsMatch(tag, "^[IOB]-SECONDARY_LOCALITY"))
                {
                    addr.secondary_locality = addr.secondary_locality + piece;
                }
                else if (Regex.IsMatch(tag, "^[IOB]-THOROFARE"))
                {
                    addr.thorofare = addr.thorofare + piece;
                }
                else if (Regex.IsMatch(tag, "^[IOB]-BUILDING_GROUP_NAME"))
                {
                    addr.building_group_name = addr.building_group_name + piece;
                }
                else if (Regex.IsMatch(tag, "^[IOB]-BUILDING_NAME"))
                {
                    addr.building_name = addr.building_name + piece;
                }
                else if (Regex.IsMatch(tag, "^[IOB]-SUB_BUILDING_NAME"))
                {
                    addr.sub_building_name = addr.sub_building_name + piece;
                }
                else if (Regex.IsMatch(tag, "^[IOB]-BUILDING_NUMBER"))
                {
                    addr.building_number = addr.building_number + piece;
                }
                else if (Regex.IsMatch(tag, "^[IOB]-DEPARTMENT"))
                {
                    addr.department = addr.department + piece;
                }
                else if (Regex.IsMatch(tag, "^[IOB]-ORGANISATION_NAME"))
                {
                    addr.organisation_name = addr.organisation_name + piece;
                }
                else if (tag == "NUMBER")
                {
                    collectedNumbers = collectedNumbers + piece;
                }
            }

            // Break the collected number text into individual tokens.
            string[] separators = new string[] { " " };
            addr.numbers = collectedNumbers.Split(separators, StringSplitOptions.RemoveEmptyEntries);

            // Drop the trailing separator that was appended after each piece.
            addr.locality            = addr.locality.Trim();
            addr.secondary_locality  = addr.secondary_locality.Trim();
            addr.thorofare           = addr.thorofare.Trim();
            addr.building_group_name = addr.building_group_name.Trim();
            addr.building_name       = addr.building_name.Trim();
            addr.sub_building_name   = addr.sub_building_name.Trim();
            addr.building_number     = addr.building_number.Trim();
            addr.department          = addr.department.Trim();
            addr.organisation_name   = addr.organisation_name.Trim();
        }