/// <summary>
/// Builds a <c>FullAddress</c> over the given address data path and tagged address.
/// </summary>
/// <param name="addrPath">Path stored in <c>addrPath</c> for later use by this instance.</param>
/// <param name="ner">Tagged address stored in <c>ner</c> for later use by this instance.</param>
public FullAddress(string addrPath, ref NERAddress ner)
{
    // Tuning values; presumably the minimum similarity score and the maximum
    // edit distance accepted when matching - TODO confirm against the matcher.
    MAX_DIST = 2;
    MIN_SCORE = 0.90;

    this.ner = ner;
    this.addrPath = addrPath;
}
/// <summary>
/// Entry point. Loads the locality, secondary-locality and thorofare databases,
/// generates the training data and model when the training file does not exist,
/// then, for every CSV file in the partial-addresses directory, indexes the file
/// in Elasticsearch and runs each row through the NER model and the full-address
/// matcher, writing the outcome back to the row's Elasticsearch document.
/// </summary>
static void Main()
{
    Paths paths = new Paths();

    // init locality database
    DataBase locality = new DataBase(paths.path.localitiesPath, "county_id", "locality_id", "name");
    // init secondary locality database
    DataBase secLocality = new DataBase(paths.path.secLocalityPath, "county_id", "locality_id", "name");
    // init thorofare database
    DataBase thorofare = new DataBase(paths.path.thorofarePath, "county_id", "thorfare_id", "thorfare_name");

    // train data
    NERTrain train = new NERTrain(ref locality, ref secLocality, ref thorofare);

    // Probe for the training file without opening it: opening a stream just to
    // see whether it throws leaks the file handle.
    if (!File.Exists(paths.path.trainPath))
    {
        Console.WriteLine("Generating training data...");
        train.createTrainingData(paths.path.fullAddressesPath, paths.path.trainPath);
        Console.WriteLine("Creating model...");
        train.createModelFromTrainingData(paths.path.trainPath, paths.path.modelPath, paths.path.properties);
    }

    foreach (string partial_file in Directory.EnumerateFiles(paths.path.partialAddresses, "*.csv"))
    {
        Stats statistics = new Stats();

        using (TextFieldParser parser = new TextFieldParser(partial_file))
        {
            parser.TextFieldType = FieldType.Delimited;
            parser.SetDelimiters(",");

            // Header row; ReadFields returns null when the file has no data at all.
            string[] fields = parser.ReadFields();
            if (fields == null)
            {
                Console.WriteLine("Skipping empty file: " + partial_file);
                continue;
            }

            // Column positions used when the header does not name them.
            int address_index = 1;
            int county_index = 3;
            for (int field_index = 0; field_index < fields.Length; field_index++)
            {
                if (string.Equals(fields[field_index], "address", StringComparison.OrdinalIgnoreCase))
                {
                    address_index = field_index;
                }
                else if (string.Equals(fields[field_index], "county", StringComparison.OrdinalIgnoreCase))
                {
                    county_index = field_index;
                }
            }

            // init elastic - once per file. Re-sending the whole bulk payload for
            // every row is redundant work.
            // NOTE(review): if the bulk action overwrites documents by id, doing it
            // per row would also reset rows that were already updated - confirm
            // against the Elastic helper.
            Elastic obj = new Elastic(paths.path.indexPath);
            var json = obj.ConvertCsvFileToJsonBulkList(partial_file);
            // docs indexed starting from id 1
            obj.SendJsonToElastic(json);

            int doc_id = 0;
            while (!parser.EndOfData)
            {
                doc_id += 1;
                string doc_key = doc_id.ToString();
                Console.Write("\n");

                fields = parser.ReadFields();
                string user_search = fields.GetValueAt<string>(address_index);
                string county = fields.GetValueAt<string>(county_index);

                // print input address
                Console.WriteLine("Input address: " + user_search);

                // set processing flag in elastic
                obj.updateDocument(doc_key, @"{""doc"": {""flag"": ""processing""}}");
                Console.WriteLine("Document in elasticsearch: " + obj.QueryElasticById(doc_key));

                // begin searching
                WriteLineGreen("Searching...");

                // init model
                NERAddress address = new NERAddress();
                // normalize data
                string[] user_search_normalized = address.normalize(user_search);
                // print what model found
                train.useModel(paths.path.modelPath, user_search.ToLower(), ref address);
                // print normalized address
                Console.WriteLine("Normalized address[tokens]: " + string.Join("|", user_search_normalized));

                // init FullAddress
                FullAddress addr = new FullAddress(paths.path.fullAddressesPath, ref address);

                // set flag to processed in elastic
                obj.updateDocument(doc_key, @"{""doc"": {""flag"": ""processed""}}");

                List<FullAddressFields> best_addr = addr.getBestAddresses(county, user_search_normalized);
                if (best_addr.Count == 0)
                {
                    Console.WriteLine("No matches found");
                    statistics.no_matches += 1;
                }
                else if (best_addr.Count == 1)
                {
                    // Single candidate: merge its fields straight into the document.
                    obj.updateDocument(doc_key, @"{""doc"":" + JsonConvert.SerializeObject(best_addr[0]) + "}");
                    statistics.exact_matches += 1;
                }
                else
                {
                    // for maybe matches index nested objects
                    List<string> json_objects = new List<string>();
                    foreach (FullAddressFields best_addr_i in best_addr)
                    {
                        json_objects.Add(JsonConvert.SerializeObject(best_addr_i));
                    }
                    string nested_json = "[" + String.Join(",", json_objects) + "]";
                    obj.updateDocument(doc_key, @"{""doc"": {""matches"": " + nested_json + "}}");
                    statistics.maybe_matches += 1;
                }
                Console.WriteLine("Document in elasticsearch: " + obj.QueryElasticById(doc_key));

                // Output only addresses
                foreach (FullAddressFields best_addr_i in best_addr)
                {
                    WriteLineGreen(best_addr_i.address);
                }
            }
        }

        // print statistics
        Console.WriteLine("Exact matches found: " + statistics.exact_matches);
        Console.WriteLine("One to many matches found: " + statistics.maybe_matches);
        Console.WriteLine("No matches found: " + statistics.no_matches);
    }
}

/// <summary>
/// Writes <paramref name="text"/> verbatim on its own line in green, then resets
/// the console colors. The text is never treated as a composite format string,
/// so braces in it are printed as-is instead of raising FormatException.
/// </summary>
private static void WriteLineGreen(string text)
{
    Console.ForegroundColor = ConsoleColor.Green;
    Console.WriteLine(text);
    Console.ResetColor();
}
/// <summary>
/// Classifies <paramref name="partial_address"/> with the CRF model loaded from
/// <paramref name="inputPath"/> and copies the tagged tokens into
/// <paramref name="addr"/>.
/// </summary>
/// <param name="inputPath">Path handed to <c>CRFClassifier.getClassifierNoExceptions</c>.</param>
/// <param name="partial_address">Address text to classify.</param>
/// <param name="addr">
/// Receives the result. Tagged text is appended to whatever each component field
/// already holds and the field is then trimmed; <c>numbers</c> is replaced with
/// the space-separated tokens of all <c>NUMBER</c> elements.
/// </param>
/// <remarks>
/// When the classifier output cannot be parsed as XML a message is written to the
/// console and <paramref name="addr"/> is left unmodified.
/// </remarks>
public void useModel(string inputPath, string partial_address, ref NERAddress addr)
{
    // NOTE(review): the classifier is loaded from inputPath on every call;
    // callers invoking this per address may want to cache it.
    CRFClassifier model = CRFClassifier.getClassifierNoExceptions(inputPath);
    string tagged_address = model.classifyWithInlineXML(partial_address);

    // An XML element name cannot start with a digit, so rename the "0" tag
    // before handing the text to the XML parser.
    tagged_address = tagged_address.Replace("<0>", "<ZERO>");
    tagged_address = tagged_address.Replace("</0>", "</ZERO>");

    // parse xml
    XmlDocument doc = new XmlDocument();
    try
    {
        doc.LoadXml("<root>" + tagged_address + "</root>");
    }
    catch (XmlException e)
    {
        Console.WriteLine("Exception occurred while parsing xml: " + e.Message);
        return;
    }
    Console.WriteLine("Model output: " + tagged_address);

    string numbers = "";
    foreach (XmlNode node in doc.DocumentElement.ChildNodes)
    {
        string name = node.Name;
        if (name == "NUMBER")
        {
            numbers += node.InnerText + " ";
            continue;
        }

        // Component tags have the form "<I|O|B>-<LABEL>"; anything else
        // (text nodes, other tags) carries no address component.
        if (name.Length < 3 || name[1] != '-' || "IOB".IndexOf(name[0]) < 0)
        {
            continue;
        }

        string label = name.Substring(2);
        string token = node.InnerText + " ";

        // None of these labels is a prefix of another, so at most one branch applies.
        if (label.StartsWith("LOCALITY", StringComparison.Ordinal))
        {
            addr.locality += token;
        }
        else if (label.StartsWith("SECONDARY_LOCALITY", StringComparison.Ordinal))
        {
            addr.secondary_locality += token;
        }
        else if (label.StartsWith("THOROFARE", StringComparison.Ordinal))
        {
            addr.thorofare += token;
        }
        else if (label.StartsWith("BUILDING_GROUP_NAME", StringComparison.Ordinal))
        {
            addr.building_group_name += token;
        }
        else if (label.StartsWith("BUILDING_NAME", StringComparison.Ordinal))
        {
            addr.building_name += token;
        }
        else if (label.StartsWith("SUB_BUILDING_NAME", StringComparison.Ordinal))
        {
            addr.sub_building_name += token;
        }
        else if (label.StartsWith("BUILDING_NUMBER", StringComparison.Ordinal))
        {
            addr.building_number += token;
        }
        else if (label.StartsWith("DEPARTMENT", StringComparison.Ordinal))
        {
            addr.department += token;
        }
        else if (label.StartsWith("ORGANISATION_NAME", StringComparison.Ordinal))
        {
            addr.organisation_name += token;
        }
    }

    addr.numbers = numbers.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);

    // A component that never occurred may still be null (if NERAddress does not
    // pre-initialise it); normalise to "" instead of failing on Trim().
    addr.locality = TrimOrEmpty(addr.locality);
    addr.secondary_locality = TrimOrEmpty(addr.secondary_locality);
    addr.thorofare = TrimOrEmpty(addr.thorofare);
    addr.building_group_name = TrimOrEmpty(addr.building_group_name);
    addr.building_name = TrimOrEmpty(addr.building_name);
    addr.sub_building_name = TrimOrEmpty(addr.sub_building_name);
    addr.building_number = TrimOrEmpty(addr.building_number);
    addr.department = TrimOrEmpty(addr.department);
    addr.organisation_name = TrimOrEmpty(addr.organisation_name);
}

/// <summary>Trims <paramref name="value"/>, mapping null to the empty string.</summary>
private static string TrimOrEmpty(string value)
{
    return value == null ? "" : value.Trim();
}