public static void LoadDBpediaRedirect()
{
    lock (dbpediaRedirectLocker)
    {
        if (redirects == null)
        {
            var dic = new Dictionary<string, string>();
            var dic2 = new Dictionary<string, string>();
            var dic3 = new Dictionary<string, string>();
            var reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.dbpedia_redirect_file));
            var line = "";
            // Collapse runs of underscores into a single space; strip "(...)" disambiguators.
            var underline2space = new System.Text.RegularExpressions.Regex(@"_+");
            var deleteBrace = new System.Text.RegularExpressions.Regex(@"\(\w+\)");
            while ((line = reader.ReadLine()) != null)
            {
                line = line.ToLower();
                var array = line.Split('\t');
                var source = deleteBrace.Replace(array[0], "");
                source = underline2space.Replace(source, " ").Trim();
                var des = deleteBrace.Replace(array[1], "");
                des = underline2space.Replace(des, " ").Trim();
                dic[source] = des;
                // deleteSpace is the class-level regex that removes whitespace.
                var source2 = deleteSpace.Replace(source, "");
                var des2 = deleteSpace.Replace(des, "");
                dic2[source2] = des2;
                dic3[source2] = des;
            }
            reader.Close();
            redirects = dic;
            redirectsWithoutSpace = dic2;
            redirectsWithoutSpace2WithSpace = dic3;
        }
    }
}
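// Illustrative helper (an assumption, not part of the original class): resolve a
// surface form through the redirect maps loaded above, falling back from the
// space-normalized key to the space-stripped key. Returns the input unchanged
// when no redirect is known.
private static string ResolveRedirect(string surface)
{
    LoadDBpediaRedirect();
    string target;
    if (redirects.TryGetValue(surface, out target))
    {
        return target;
    }
    // Try the space-stripped variant, mapping back to the spaced redirect target.
    var compact = surface.Replace(" ", "");
    if (redirectsWithoutSpace2WithSpace.TryGetValue(compact, out target))
    {
        return target;
    }
    return surface;
}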
/* Read the type dictionary from file. */
private static void LoadDictionary()
{
    lock (dicLocker)
    {
        if (dicTypeMap == null)
        {
            FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.dic_file));
            string line;
            List<string> list;
            dics = new Dictionary<string, List<string>>();
            var dic = new Dictionary<string, int>();
            var set = new HashSet<string>();
            while ((line = reader.ReadLine()) != null)
            {
                list = line.Split('\t').ToList();
                // First column is the dictionary name; the remaining columns are its values.
                List<string> strs = list.GetRange(1, list.Count - 1);
                dics[list[0]] = strs;
                strs.ForEach(x => set.Add(x));
            }
            // Assign each distinct type a sequential index.
            foreach (var type in set)
            {
                dic[type] = dic.Count;
            }
            reader.Close();
            dicTypeMap = dic;
        }
    }
}
private static void LoadStemMap()
{
    lock (stemmerLocker)
    {
        if (stemWordDic == null)
        {
            var dic = new Dictionary<string, string>();
            FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.stem_map));
            string line;
            string[] array;
            while ((line = reader.ReadLine()) != null)
            {
                array = line.Split('\t');
                try
                {
                    dic[array[0]] = array[1];
                }
                catch (Exception)
                {
                    continue;   // skip malformed lines
                }
            }
            reader.Close();
            stemWordDic = dic;
        }
    }
}
private static void LoadWordClusterID()
{
    lock (wordIDLocker)
    {
        if (wordIdDic == null)
        {
            var dic = new Dictionary<string, int>();
            FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.word_id_file));
            string line;
            string[] array;
            var ids = new HashSet<int>();
            while ((line = reader.ReadLine()) != null)
            {
                array = line.Split('\t');
                try
                {
                    var id = int.Parse(array[1]);
                    ids.Add(id);
                    dic[array[0]] = id;
                }
                catch (Exception)
                {
                    continue;
                }
            }
            reader.Close();
            wordClusterSize = ids.Count;
            wordIdDic = dic;
        }
    }
}
private static void LoadWordTable()
{
    lock (wordTableLocker)
    {
        if (word2index == null)
        {
            FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.word_table_file));
            string line;
            var dic = new Dictionary<string, int>();
            while ((line = reader.ReadLine()) != null)
            {
                var array = line.Split('\t');
                try
                {
                    // Map each word to the next free index.
                    var count = dic.Count;
                    dic[array[0]] = count;
                }
                catch (Exception)
                {
                    continue;
                }
            }
            reader.Close();
            word2index = dic;
        }
    }
}
/// <summary>
/// Mention words are separated by "_".
/// </summary>
private static void LoadMentionClusterID()
{
    lock (mentionIDLocker)
    {
        if (mentionIdDic == null)
        {
            var dic = new Dictionary<string, int>();
            FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.mention_id_file));
            string line;
            string[] array;
            var ids = new HashSet<int>();
            var underline2space = new System.Text.RegularExpressions.Regex(@"_+");
            while ((line = reader.ReadLine()) != null)
            {
                array = line.Split('\t');
                try
                {
                    var id = int.Parse(array[1]);
                    ids.Add(id);
                    // Restore the spaces that the cluster file encodes as underscores.
                    array[0] = underline2space.Replace(array[0], " ");
                    dic[array[0]] = id;
                }
                catch (Exception)
                {
                    continue;
                }
            }
            reader.Close();
            mentionClusterSize = ids.Count;
            mentionIdDic = dic;
        }
    }
}
private static void LoadPosTagTable()
{
    lock (posTagLocker)
    {
        if (posTag2index == null)
        {
            var dic = new Dictionary<string, int>();
            FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.posTag_table_file));
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                try
                {
                    var count = dic.Count;
                    dic[line] = count;
                }
                catch (Exception)
                {
                    continue;
                }
            }
            reader.Close();
            posTag2index = dic;
        }
    }
}
private void Initial(string modelDir = null)
{
    var props = new Properties();
    props.put("annotators", "tokenize");
    props.setProperty("ner.useSUTime", "false");
    // StanfordCoreNLP resolves model paths relative to the working directory,
    // so switch to the model directory while constructing the pipeline.
    var dir = Directory.GetCurrentDirectory();
    Directory.SetCurrentDirectory(modelDir ?? (string)GlobalParameter.Get(DefaultParameter.Field.stanford_model_dir));
    pipeline = new StanfordCoreNLP(props);
    Directory.SetCurrentDirectory(dir);
}
void Initial(string modelDir = null)
{
    var props = new Properties();
    props.put("annotators", "tokenize,ssplit");
    // Whitespace tokenization is switched on with the "tokenize.whitespace" property.
    props.put("tokenize.whitespace", "true");
    var dir = Directory.GetCurrentDirectory();
    Directory.SetCurrentDirectory(modelDir ?? (string)GlobalParameter.Get(DefaultParameter.Field.stanford_model_dir));
    pipeline = new StanfordCoreNLP(props);
    Directory.SetCurrentDirectory(dir);
}
void Initial() { var props = new Properties(); props.put("annotators", "tokenize,ssplit, pos,depparse"); props.setProperty("tokenizer.whitespace", "true"); props.setProperty("ssplit.isOneSentence", "true"); var dir = Directory.GetCurrentDirectory(); Directory.SetCurrentDirectory((string)GlobalParameter.Get(DefaultParameter.Field.stanford_model_dir)); pipeline = new StanfordCoreNLP(props); Directory.SetCurrentDirectory(dir); }
/*********************Interactive Interface***********************/
/// <summary>
/// Supported operations:
///   /ewt : extract word table
///   /ef  : extract features
///       -b     : extract features for the Bayes model (default)
///       -s     : extract features for the SVM model
///       -all   : extract features for all data splits (default)
///       -train : extract train data features
///       -dev   : extract develop data features
///       -test  : extract test data features
///   /out
///       -dt : output dictionary type and value
///   /tr
///       -b : train extracted features with the Bayes model (default)
///   /ts
///       -b : test extracted features with the Bayes model (default)
/// </summary>
public void Execute()
{
    string operation = null; // current command
    var options = new HashSet<string>();
    var method = (string)GlobalParameter.Get(DefaultParameter.Field.method);
    var array = Regex.Split(method, @"\s+");
    for (var i = 0; i < array.Length; i++)
    {
        if (array[i].StartsWith("/"))
        {
            // A new operation starts: execute the previous one first.
            if (operation != null)
            {
                Invoke(operation, options);
                options = new HashSet<string>(); // options must not leak into the next operation
            }
            operation = array[i].Substring(1);
            if (!IsValidOperation(operation))
            {
                Console.WriteLine(operation + " is not a valid operation!");
                // Skip the invalid operation together with its options.
                i++;
                while (i < array.Length && !array[i].StartsWith("/"))
                {
                    i++;
                }
                i--;
                operation = null; // make sure the invalid operation is never invoked
            }
        }
        else if (array[i].StartsWith("-"))
        {
            var option = array[i].Substring(1);
            if (IsValidOption(operation, option))
            {
                options.Add(option);
            }
            else
            {
                Console.Error.WriteLine(option + " is invalid for operation: " + operation);
            }
        }
    }
    // Invoke the last operation.
    if (operation != null)
    {
        Invoke(operation, options);
    }
}
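// Usage sketch (hypothetical, for illustration only): Execute() reads the whole
// command string from the "method" parameter, so a run is configured rather than
// typed interactively. A string such as
//
//     "/ewt /ef -bayes -train /tr -b"
//
// would build the word table, extract Bayes features from the train data, and
// train the Bayes model, in that order. The exact option spellings must match
// whatever IsValidOption accepts; how the parameter is written into
// GlobalParameter depends on that class's real API and is not shown here.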
/* Train file format:
 *     Mention \t Type \t Context
 * Extract the word table and word-shape table from the train data.
 * Every word is converted to lowercase and stemmed.
 */
public void ExtractWordTable()
{
    FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.train_data_file));
    FileWriter writer = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.word_table_file), FileMode.Create);
    FileWriter wordShapeWriter = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.word_shape_table_file), FileMode.Create);
    string line = null;
    var wordTable = new HashSet<string>();
    var wordShapeTable = new HashSet<string>();
    while ((line = reader.ReadLine()) != null)
    {
        try
        {
            var array = line.Split('\t');
            var tokenizer = TokenizerPool.GetTokenizer();
            var words = tokenizer.Tokenize(array[2]);
            TokenizerPool.ReturnTokenizer(tokenizer);
            foreach (var w in words)
            {
                if (!string.IsNullOrEmpty(w))   // w should not be empty
                {
                    var shape = Feature.GetWordShape(w);
                    if (!wordShapeTable.Contains(shape))
                    {
                        wordShapeWriter.WriteLine(shape);
                        wordShapeTable.Add(shape);
                    }
                    var word = Generalizer.Generalize(w);
                    if (!wordTable.Contains(word))
                    {
                        writer.WriteLine(word);
                        wordTable.Add(word);
                    }
                }
            }
        }
        catch (Exception e)
        {
            Console.WriteLine("=================error!===============");
            Console.WriteLine("\t" + e.Message);
            Console.WriteLine("\t" + e.StackTrace);
            Console.WriteLine("=================error!===============");
            continue;
        }
    }
    reader.Close();
    writer.Close();
    wordShapeWriter.Close();
}
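// Illustrative train-data line (assumed values) under the format documented above,
// with "\t" standing for a real tab character:
//
//     Steve Jobs\tperson\tSteve Jobs co-founded Apple in 1976 .
//
// Tokenizing the context column contributes one entry per token to the word-shape
// table (via Feature.GetWordShape) and to the word table (via Generalizer.Generalize).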
private void Initial()
{
    var basedir = (string)GlobalParameter.Get(DefaultParameter.Field.opennlp_model_dir);
    // Load the location name model into a stream, load the model, and create the name finder.
    var modelInputStream = new java.io.FileInputStream(Path.Combine(basedir, "en-ner-location.bin"));
    var model = new opennlp.tools.namefind.TokenNameFinderModel(modelInputStream);
    locationNameFinder = new opennlp.tools.namefind.NameFinderME(model);

    modelInputStream = new java.io.FileInputStream(Path.Combine(basedir, "en-ner-person.bin"));
    model = new opennlp.tools.namefind.TokenNameFinderModel(modelInputStream);
    personNameFinder = new opennlp.tools.namefind.NameFinderME(model);

    modelInputStream = new java.io.FileInputStream(Path.Combine(basedir, "en-ner-organization.bin"));
    model = new opennlp.tools.namefind.TokenNameFinderModel(modelInputStream);
    organizationNameFinder = new opennlp.tools.namefind.NameFinderME(model);
}
void Initial()
{
    // Create the StanfordCoreNLP properties with POS tagging
    // (required for lemmatization) and lemmatization.
    var props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma");
    props.setProperty("tokenize.whitespace", "true");
    props.setProperty("ssplit.eolonly", "true");
    props.setProperty("ner.useSUTime", "false");
    var dir = Directory.GetCurrentDirectory();
    Directory.SetCurrentDirectory((string)GlobalParameter.Get(DefaultParameter.Field.stanford_model_dir));
    pipeline = new StanfordCoreNLP(props);
    Directory.SetCurrentDirectory(dir);
}
private void OutputDicTypeValue()
{
    var dic = DataCenter.GetDicTyeMap();
    var writer = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.dic_type_value_file), FileMode.OpenOrCreate);
    foreach (var key in dic.Keys)
    {
        if (GlobalParameter.featureNum != 0)
        {
            // Dictionary-type features occupy the tail of the feature vector,
            // so offset each type index by (featureNum - number of dictionary types).
            writer.WriteLine(key + "\t" + (GlobalParameter.featureNum - DataCenter.GetDicTypeNum() + dic[key]));
        }
        else
        {
            writer.WriteLine(key + "\t" + dic[key]);
        }
    }
    writer.Close();
}
void Initial()
{
    // Create the StanfordCoreNLP properties with POS tagging
    // (required for lemmatization), lemmatization, and NER.
    var props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner");
    props.setProperty("tokenize.whitespace", "true");
    props.setProperty("ssplit.eolonly", "true");
    props.setProperty("ner.useSUTime", "false");
    var dir = Directory.GetCurrentDirectory();
    Directory.SetCurrentDirectory((string)GlobalParameter.Get(DefaultParameter.Field.stanford_model_dir));
    pipeline = new StanfordCoreNLP(props);
    Directory.SetCurrentDirectory(dir);
}
public static void LoadDBpedia()
{
    lock (dbpediaDicLocker)
    {
        if (dbpediaEntity2Type == null)
        {
            var dic = new Dictionary<string, object>();
            object types = null;
            var reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.dbpedia_dic_file));
            var line = "";
            var deleteUnderline = new System.Text.RegularExpressions.Regex(@"_+");
            var deleteBrace = new System.Text.RegularExpressions.Regex(@"\(\w+\)");
            while ((line = reader.ReadLine()) != null)
            {
                line = line.ToLower();
                var array = line.Split('\t');
                var entity = deleteBrace.Replace(array[0], "");
                entity = deleteUnderline.Replace(entity, "").Trim(); // keys contain no spaces
                if (dic.TryGetValue(entity, out types))
                {
                    // Promote a single type to a set once an entity has two or more types.
                    if (types is string)
                    {
                        var set = new HashSet<string>();
                        set.Add((string)types);
                        set.Add(array[1]);
                        dic[entity] = set;
                    }
                    else
                    {
                        ((HashSet<string>)types).Add(array[1]);
                    }
                }
                else
                {
                    dic[entity] = array[1];
                }
            }
            reader.Close();
            dbpediaEntity2Type = dic;
        }
    }
}
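// Illustrative helper (an assumption, not part of the original class): the value
// side of dbpediaEntity2Type is either a single string or a HashSet<string>, so
// callers need a small normalization step like this one. Assumes System.Linq is
// imported, as the rest of the file already relies on it.
private static IEnumerable<string> GetEntityTypes(string entity)
{
    LoadDBpedia();
    object types;
    if (!dbpediaEntity2Type.TryGetValue(entity, out types))
    {
        return Enumerable.Empty<string>();
    }
    var single = types as string;
    return single != null
        ? (IEnumerable<string>)new[] { single }
        : (HashSet<string>)types;
}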
/* Read the name list from file. */
private static void LoadNameSet()
{
    fullNameSet = new HashSet<string>();
    partNameSet = new HashSet<string>();
    FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.name_list_file));
    string line;
    string[] array;
    while ((line = reader.ReadLine()) != null)
    {
        array = line.Split(' ');
        fullNameSet.Add(line);
        foreach (var x in array)
        {
            partNameSet.Add(x);
        }
    }
    reader.Close();
}
private static void LoadPageAnchors()
{
    lock (pageAnchorLocker)
    {
        if (pageAnchorsDic == null)
        {
            var reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.page_anchor_file));
            var line = "";
            var dic = new Dictionary<string, List<string>>();
            while ((line = reader.ReadLine()) != null)
            {
                // File format: the first column is the page title, the rest are anchors.
                var array = line.Split('\t');
                var list = array.ToList();
                list.RemoveAt(0);
                dic[array[0]] = list;
            }
            reader.Close();
            pageAnchorsDic = dic;
        }
    }
}
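// Illustrative lookup (an assumption, not in the original source): given the
// "title\tanchor\tanchor..." layout the loader above relies on, fetch the anchor
// texts recorded for a page, or an empty list when the page is unknown.
private static List<string> GetPageAnchors(string pageTitle)
{
    LoadPageAnchors();
    List<string> anchors;
    return pageAnchorsDic.TryGetValue(pageTitle, out anchors)
        ? anchors
        : new List<string>();
}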
private void Train(HashSet<string> options)
{
    if (options == null)
    {
        options = new HashSet<string>(new string[] { "b" });
    }
    // Train with the Bayes model.
    if (options.Contains("b"))
    {
        var trainer = new BayesModel((string)GlobalParameter.Get(DefaultParameter.Field.train_feature_file),
                                     (string)GlobalParameter.Get(DefaultParameter.Field.model_file));
        try
        {
            trainer.Train();
        }
        catch (Exception e)
        {
            Console.WriteLine("Error occurred during training: " + e.Message);
            throw;
        }
    }
}
private void Test(HashSet<string> options)
{
    if (options == null)
    {
        options = new HashSet<string>(new string[] { "b" });
    }
    // Test with the Bayes model.
    if (options.Contains("b"))
    {
        var tester = new BayesTest((string)GlobalParameter.Get(DefaultParameter.Field.model_file),
                                   (string)GlobalParameter.Get(DefaultParameter.Field.develop_feature_file),
                                   (string)GlobalParameter.Get(DefaultParameter.Field.test_result_file));
        try
        {
            tester.Test();
        }
        catch (Exception e)
        {
            Console.WriteLine("Error occurred during testing: " + e.Message);
            throw;
        }
    }
}
private static void LoadDisambiguous()
{
    lock (disambiguousLocker)
    {
        if (disambiguousDic == null)
        {
            var dic = new Dictionary<string, List<string>>();
            var reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.disambiguous_file));
            var line = "";
            var deleteUnderline = new System.Text.RegularExpressions.Regex(@"_+");
            while ((line = reader.ReadLine()) != null)
            {
                // Strip underscores, then split on tabs; the first column is the key,
                // the remaining columns are its disambiguation candidates.
                var l = deleteUnderline.Replace(line, "");
                var array = l.Split('\t').ToList();
                var key = array[0];
                array.RemoveAt(0);
                dic[key] = array;
            }
            reader.Close();
            disambiguousDic = dic;
        }
    }
}
private static void LoadKeyWords()
{
    lock (keyWordLocker)
    {
        if (keyWords == null)
        {
            var reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.keyword_file));
            var line = "";
            var dic = new Dictionary<string, int>();
            while ((line = reader.ReadLine()) != null)
            {
                var token = line.Trim();
                if (!dic.ContainsKey(token))
                {
                    dic[token] = dic.Count;
                }
            }
            reader.Close();
            // Reserve the last index for unseen tokens.
            dic["NONE"] = dic.Count;
            keyWords = dic;
        }
    }
}
/// <summary>
/// Extract features.
/// </summary>
/// <param name="options">
/// bayes : extract features for the Bayes model (default)
/// svm   : extract features for the SVM model
/// me    : extract features for the maximum-entropy model
/// raw   : extract raw (individual) features; combine with "add" to append
/// all   : extract features for all data splits (default)
/// train : extract train data features
/// dev   : extract develop data features
/// test  : extract test data features
/// </param>
private void ExtractFeature(HashSet<string> options)
{
    if (options == null)
    {
        // Set default options.
        options = new HashSet<string>(new string[] { "bayes", "all" });
    }
    if (options.Contains("bayes"))
    {
        // Extract features for the Bayes model.
        if (options.Contains("train") || options.Contains("all"))
        {
            ExtractBayesFeature((string)GlobalParameter.Get(DefaultParameter.Field.train_data_file),
                                (string)GlobalParameter.Get(DefaultParameter.Field.train_feature_file));
        }
        if (options.Contains("dev") || options.Contains("all"))
        {
            ExtractBayesFeature((string)GlobalParameter.Get(DefaultParameter.Field.develop_data_file),
                                (string)GlobalParameter.Get(DefaultParameter.Field.develop_feature_file));
        }
        if (options.Contains("test") || options.Contains("all"))
        {
            ExtractBayesFeature((string)GlobalParameter.Get(DefaultParameter.Field.test_data_file),
                                (string)GlobalParameter.Get(DefaultParameter.Field.test_feature_file));
        }
    }
    else if (options.Contains("svm"))
    {
        // Extract features for the SVM model.
        if (options.Contains("train") || options.Contains("all"))
        {
            var extractor = new ParallelSVMFeatureExtractor((string)GlobalParameter.Get(DefaultParameter.Field.train_data_file),
                                                            (string)GlobalParameter.Get(DefaultParameter.Field.train_feature_file));
            extractor.ExtractFeature();
        }
        if (options.Contains("dev") || options.Contains("all"))
        {
            var extractor = new ParallelSVMFeatureExtractor((string)GlobalParameter.Get(DefaultParameter.Field.develop_data_file),
                                                            (string)GlobalParameter.Get(DefaultParameter.Field.develop_feature_file));
            extractor.ExtractFeature();
        }
        if (options.Contains("test") || options.Contains("all"))
        {
            var extractor = new ParallelSVMFeatureExtractor((string)GlobalParameter.Get(DefaultParameter.Field.test_data_file),
                                                            (string)GlobalParameter.Get(DefaultParameter.Field.test_feature_file));
            extractor.ExtractFeature();
        }
    }
    else if (options.Contains("me"))
    {
        // Extract features for the maximum-entropy model.
        if (options.Contains("train") || options.Contains("all"))
        {
            var extractor = new ParallelMaxEntFeatureExtractor((string)GlobalParameter.Get(DefaultParameter.Field.train_data_file),
                                                               (string)GlobalParameter.Get(DefaultParameter.Field.train_feature_file));
            extractor.ExtractFeature();
        }
        if (options.Contains("dev") || options.Contains("all"))
        {
            var extractor = new ParallelMaxEntFeatureExtractor((string)GlobalParameter.Get(DefaultParameter.Field.develop_data_file),
                                                               (string)GlobalParameter.Get(DefaultParameter.Field.develop_feature_file));
            extractor.ExtractFeature();
        }
        if (options.Contains("test") || options.Contains("all"))
        {
            var extractor = new ParallelMaxEntFeatureExtractor((string)GlobalParameter.Get(DefaultParameter.Field.test_data_file),
                                                               (string)GlobalParameter.Get(DefaultParameter.Field.test_feature_file));
            extractor.ExtractFeature();
        }
    }
    else if (options.Contains("raw"))
    {
        // Extract raw features; "add" appends to existing feature files.
        if (options.Contains("train") || options.Contains("all"))
        {
            var extractor = new ParallelIndividualFeatureExtractor((string)GlobalParameter.Get(DefaultParameter.Field.train_data_file),
                                                                   (string)GlobalParameter.Get(DefaultParameter.Field.train_feature_file));
            if (options.Contains("add")) { extractor.AddFeature(); }
            else { extractor.ExtractFeature(); }
        }
        if (options.Contains("dev") || options.Contains("all"))
        {
            var extractor = new ParallelIndividualFeatureExtractor((string)GlobalParameter.Get(DefaultParameter.Field.develop_data_file),
                                                                   (string)GlobalParameter.Get(DefaultParameter.Field.develop_feature_file));
            if (options.Contains("add")) { extractor.AddFeature(); }
            else { extractor.ExtractFeature(); }
        }
        if (options.Contains("test") || options.Contains("all"))
        {
            var extractor = new ParallelIndividualFeatureExtractor((string)GlobalParameter.Get(DefaultParameter.Field.test_data_file),
                                                                   (string)GlobalParameter.Get(DefaultParameter.Field.test_feature_file));
            if (options.Contains("add")) { extractor.AddFeature(); }
            else { extractor.ExtractFeature(); }
        }
    }
}
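// Illustrative call (hypothetical; ExtractFeature is private, so this would run
// from inside the same class): extract SVM-format features for the train and
// develop splits only, leaving the test split untouched.
//
//     ExtractFeature(new HashSet<string> { "svm", "train", "dev" });
//
// Note that the model options are mutually exclusive branches, so a set containing
// both "bayes" and "svm" would only run the Bayes branch.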