// Collect the first tab-separated field from the first 100 lines of every
// file under sourceDir and dump the distinct values to keywords.txt.
static void Temp()
{
    var sourceDir = @"D:\Codes\Project\EntityTyping\Fine-ner\input\tmp\";
    var des = @"D:\Codes\Project\EntityTyping\Fine-ner\input\keywords.txt";
    var reader = new LargeFileReader();
    var writer = new LargeFileWriter(des, FileMode.Create);
    var keyWords = new HashSet<string>();

    foreach (var file in Directory.GetFiles(sourceDir))
    {
        reader.Open(file);
        string line;
        var processed = 0;
        // Only sample the first 100 lines of each file.
        while ((line = reader.ReadLine()) != null)
        {
            if (++processed > 100)
            {
                break;
            }
            keyWords.Add(line.Split('\t')[0]);
        }
    }
    reader.Close();

    foreach (var word in keyWords)
    {
        writer.WriteLine(word);
    }
    writer.Close();
}
/// <summary>
/// For every feature file under the test directory, count how many events
/// carry a known DBpedia type and report the count plus coverage ratio
/// per file ("name TAB count TAB ratio").
/// </summary>
public static void Temp5()
{
    var sourceDir = @"D:\Codes\Project\EntityTyping\Fine-ner\input\feature\test";
    var sourceFiles = Directory.GetFiles(sourceDir).ToList();
    var desFile = @"D:\Codes\Project\EntityTyping\Fine-ner\input\feature\test data in dbpedia info.txt";
    var writer = new LargeFileWriter(desFile, FileMode.Create);
    for (var i = 0; i < sourceFiles.Count; i++)
    {
        var reader = new EventReaderByLine(sourceFiles[i]);
        int count = 0;   // events whose dbpedia type is known
        int total = 0;   // all events in the file
        while (reader.HasNext())
        {
            total++;
            var ev = reader.GetNextEvent();
            var rawFeature = ev.Feature.ToList();
            if (!rawFeature[(int)Event.Field.dbpediaTypes].Equals("UNKNOW"))
            {
                count++;
            }
        }
        reader.Close();
        // Fix: an empty file used to produce NaN (0/0); report 0 instead.
        var ratio = total == 0 ? 0.0 : 1.0 * count / total;
        writer.WriteLine(Path.GetFileNameWithoutExtension(sourceFiles[i]) + "\t" + count + "\t" + ratio);
    }
    writer.Close();
}
/// <summary>
/// Count how many times each DBpedia type (second tab column) occurs in
/// the entity-type file and write the types ordered by descending count.
/// </summary>
public static void Temp4()
{
    var source = @"D:\Codes\Project\EntityTyping\Fine-ner\input\dictionaries\dbpedia\dbpedia entity type.txt";
    var des = @"D:\Codes\Project\EntityTyping\Fine-ner\input\dictionaries\dbpedia\tmp.txt";
    var reader = new pml.file.reader.LargeFileReader(source);
    var writer = new LargeFileWriter(des, FileMode.Create);
    var dic = new Dictionary<string, int>();  // type --> occurrence count
    int count = 0;
    string line;
    // Fix: removed the unused HashSet<string> local from the original.
    while ((line = reader.ReadLine()) != null)
    {
        if (++count % 10000 == 0)
        {
            Console.WriteLine(count);  // progress report
        }
        var array = line.Split('\t');
        // TryGetValue leaves times = 0 for an unseen type.
        int times;
        dic.TryGetValue(array[1], out times);
        dic[array[1]] = times + 1;
    }
    reader.Close();
    foreach (var type in dic.OrderByDescending(key => key.Value))
    {
        writer.WriteLine(type.Key + "\t" + type.Value);
    }
    writer.Close();
}
/// <summary>
/// Flush the in-memory stem dictionary to disk and release it.
/// Thread-safe: the entire flush runs under stemmerLocker.
/// </summary>
/// <param name="des">Target file; defaults to the configured stem-map path.</param>
public static void RefreshStemDic(string des = null)
{
    lock (stemmerLocker)
    {
        // Nothing to flush if the dictionary was never built (or already flushed).
        // Fix: the original checked stemWordDic for null twice; the second,
        // unreachable check and the commented-out code are removed.
        if (stemWordDic == null)
        {
            return;
        }
        if (des == null)
        {
            des = (string)DefaultParameter.Get(DefaultParameter.Field.stem_map);
        }
        FileWriter writer = new LargeFileWriter(des, FileMode.Create);
        foreach (var word in stemWordDic.Keys)
        {
            writer.WriteLine(word + "\t" + stemWordDic[word]);
        }
        writer.Close();
        stemWordDic = null;  // drop the cache so it is rebuilt on next use
    }
}
/// <summary>
/// Write each dictionary type and its feature value (offset into the
/// global feature space when featureNum is set) to the dic-type-value file.
/// </summary>
private void OutputDicTypeValue()
{
    var dic = DataCenter.GetDicTyeMap();
    // Fix: FileMode.OpenOrCreate does not truncate an existing file, so a
    // shorter rewrite would leave stale trailing content; use Create.
    var writer = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.dic_type_value_file), FileMode.Create);
    foreach (var key in dic.Keys)
    {
        if (GlobalParameter.featureNum != 0)
        {
            // Offset the raw value into the global feature index space.
            writer.WriteLine(key + "\t" + (GlobalParameter.featureNum - DataCenter.GetDicTypeNum() + dic[key]));
        }
        else
        {
            writer.WriteLine(key + "\t" + dic[key]);
        }
    }
    writer.Close();
}
// Persist each word together with its assigned cluster label, one
// tab-separated pair per line, overwriting any previous file.
private void SaveWordClusterId()
{
    var output = new LargeFileWriter(wordClusterIDFile, FileMode.Create);
    var index = 0;
    while (index < words.Count)
    {
        output.WriteLine(words[index] + "\t" + labels[index]);
        index++;
    }
    output.Close();
}
// Dump the word table as "word<TAB>value" lines, replacing the old file.
private void SaveWordTable()
{
    var output = new LargeFileWriter(wordTablePath, FileMode.Create);
    foreach (var entry in wordTable.Keys)
    {
        output.WriteLine(entry + "\t" + wordTable[entry]);
    }
    output.Close();
}
// Dump the document-frequency table as "word<TAB>df" lines, replacing the
// old file.
private void SaveDf()
{
    var output = new LargeFileWriter(dfPath, FileMode.Create);
    foreach (var term in df.Keys)
    {
        output.WriteLine(term + "\t" + df[term]);
    }
    output.Close();
}
// Append a section to the shared "data info" report: the directory name
// followed by one "file TAB : TAB lineCount" row per file.
public static void GetItemNumByType()
{
    var sourceDir = @"D:\Codes\Project\EntityTyping\Fine-ner\input\feature\test";
    var desFile = @"D:\Codes\Project\EntityTyping\Fine-ner\input\feature\data info.txt";
    var sourceFiles = Directory.GetFiles(sourceDir).ToList();
    var writer = new LargeFileWriter(desFile, FileMode.Append);
    // Section header: the directory's last path component.
    writer.WriteLine(sourceDir.Substring(sourceDir.LastIndexOf("\\") + 1) + ":");
    foreach (var sourceFile in sourceFiles)
    {
        var reader = new LargeFileReader(sourceFile);
        var lineCount = 0;
        while (reader.ReadLine() != null)
        {
            lineCount++;
        }
        reader.Close();
        writer.WriteLine(Path.GetFileNameWithoutExtension(sourceFile) + "\t:\t" + lineCount);
    }
    writer.Close();
}
/// <summary>
/// Store the trained model into file.
/// </summary>
/// <param name="desPath">Destination file path.</param>
/// <param name="model">
/// A nested dictionary: label --> field --> (feature value --> times).
/// </param>
/// <format>
/// [Label]
/// [field name]
/// TAB [feature annotation, e.g., last word] TAB [times]
/// TAB [feature annotation, e.g., last word] TAB [times]
/// ###END###
/// </format>
internal static void OutputModel(string desPath, object model)
{
    var writer = new LargeFileWriter(desPath, FileMode.Create);
    var labelDics = (Dictionary<string, Dictionary<string, Dictionary<string, int>>>)model;
    foreach (var label in labelDics.Keys) // Check !
    {
        // fields --> dic<feature value, times>
        var fieldDic = labelDics[label];
        foreach (var field in fieldDic.Keys)
        {
            // The label is re-emitted before every field, so each field
            // section starts with its own [Label] line.
            writer.WriteLine(label);
            writer.WriteLine(field);
            var featureDic = fieldDic[field];
            // Feature values are ordered by frequency before writing.
            foreach (var featureValue in Feature.SortKeysByNum(featureDic))
            {
                writer.WriteLine("\t" + featureValue + "\t" + featureDic[featureValue]);
            }
        }
    }
    writer.Close();
}
/* Train file format:
 * Mention TAB Type TAB Context
 * Extract word table and word shape table from train data.
 * Every word is generalized (lowercased and stemmed) before being recorded.
 */
public void ExtractWordTable()
{
    FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.train_data_file));
    FileWriter writer = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.word_table_file), FileMode.Create);
    FileWriter wordShapeWriter = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.word_shape_table_file), FileMode.Create);
    var wordTable = new HashSet<string>();
    var wordShapeTable = new HashSet<string>();
    string line;
    while ((line = reader.ReadLine()) != null)
    {
        try
        {
            var array = line.Split('\t');
            // Tokenize the context column with a pooled tokenizer.
            var tokenizer = TokenizerPool.GetTokenizer();
            var words = tokenizer.Tokenize(array[2]);
            TokenizerPool.ReturnTokenizer(tokenizer);
            foreach (var w in words)
            {
                if (string.IsNullOrEmpty(w)) // w should not be empty
                {
                    continue;
                }
                var shape = Feature.GetWordShape(w);
                if (!wordShapeTable.Contains(shape))
                {
                    wordShapeWriter.WriteLine(shape);
                    wordShapeTable.Add(shape);
                }
                var word = Generalizer.Generalize(w);
                if (!wordTable.Contains(word))
                {
                    writer.WriteLine(word);
                    wordTable.Add(word);
                }
            }
        }
        catch (Exception e)
        {
            // Log and keep going: a malformed line must not abort extraction.
            Console.WriteLine("=================error!===============");
            Console.WriteLine("\t" + e.Message);
            Console.WriteLine("\t" + e.StackTrace);
            Console.WriteLine("=================error!===============");
            continue;
        }
    }
    reader.Close();
    writer.Close();
    // Fix: the original never closed wordShapeWriter, leaking the handle
    // and risking unflushed shape-table output.
    wordShapeWriter.Close();
}
/// <summary>
/// Extract the name column (first tab field) from the UIUC frequency list
/// into a plain name-list file.
/// </summary>
public static void ExtractUIUC()
{
    string source = @"E:\Users\v-mipeng\Data\Dictionary\name-list.freq.txt";
    string des = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\data\name-list.txt";
    FileReader reader = new LargeFileReader(source);
    FileWriter writer = new LargeFileWriter(des, FileMode.OpenOrCreate);
    string line;
    while ((line = reader.ReadLine()) != null)
    {
        // First tab-separated field is the name itself.
        writer.WriteLine(line.Split('\t')[0]);
    }
    // Fix: the original never closed the reader or writer, leaking both
    // handles and possibly losing buffered output.
    reader.Close();
    writer.Close();
}
/// <summary>
/// Save the highest-scoring key words (at most <paramref name="limit"/>,
/// descending by score) as "word TAB score" lines.
/// </summary>
/// <param name="keyWordDic">Key word --> score.</param>
/// <param name="des">Destination file path.</param>
/// <param name="limit">Maximum number of key words to write.</param>
private void SaveKeyWords(Dictionary<string, double> keyWordDic, string des, int limit = 1000)
{
    var writer = new LargeFileWriter(des, FileMode.Create);
    int count = 0;
    foreach (var item in keyWordDic.OrderByDescending(key => key.Value))
    {
        // Fix: the original checked the counter only after writing and so
        // emitted 1001 entries; stop before writing past the limit.
        if (++count > limit)
        {
            break;
        }
        writer.WriteLine(item.Key + "\t" + item.Value);
    }
    writer.Close();
}
/// <summary>
/// Combine files given by sourceFiles into one file given by desFile.
/// </summary>
/// <param name="sourceFiles">Source file paths to be combined.</param>
/// <param name="desFile">The file path to store the combined content.</param>
public static void CombineFiles(IEnumerable<string> sourceFiles, string desFile)
{
    var reader = new LargeFileReader();
    var writer = new LargeFileWriter(desFile, FileMode.Create);
    foreach (var sourcePath in sourceFiles)
    {
        reader.Open(sourcePath);
        string currentLine;
        while ((currentLine = reader.ReadLine()) != null)
        {
            writer.WriteLine(currentLine);
        }
    }
    reader.Close();
    writer.Close();
}
/// <summary>
/// Refine the disambiguations file downloaded from DBpedia: group every
/// ambiguous source item with all of its disambiguation targets on one
/// tab-separated line.
/// </summary>
/// <param name="sourceFile">Raw DBpedia disambiguations dump.</param>
/// <param name="desFile">Destination for the grouped output.</param>
public static void RefineAmbiguousItem(string sourceFile, string desFile)
{
    var reader = new LargeFileReader(sourceFile);
    var writer = new LargeFileWriter(desFile, System.IO.FileMode.Create);
    // Capture the last path segment of the source resource.
    var sourceRegex = new System.Text.RegularExpressions.Regex(@"/([^/>]+)>");
    // Strip "_(qualifier)" suffixes from source names.
    var deleteBraceRegex = new System.Text.RegularExpressions.Regex(@"_?\([^\)]+\)");
    // Last path segment of the target resource at the end of the triple.
    var desRegex = new System.Text.RegularExpressions.Regex(@"/([^/>]+)>\s\.$");
    var dic = new Dictionary<string, List<string>>(300000);
    // Skip the header line of the dump.
    reader.ReadLine();
    string line;
    while ((line = reader.ReadLine()) != null)
    {
        var source = deleteBraceRegex.Replace(sourceRegex.Match(line).Groups[1].Value, "");
        var target = desRegex.Match(line).Groups[1].Value;
        List<string> targets;
        if (!dic.TryGetValue(source, out targets))
        {
            targets = new List<string>();
            dic[source] = targets;
        }
        targets.Add(target);
    }
    reader.Close();
    foreach (var item in dic)
    {
        writer.Write(item.Key);
        foreach (var target in item.Value)
        {
            writer.Write("\t" + target);
        }
        writer.WriteLine("");
    }
    writer.Close();
}
/// <summary>
/// Tunes one weight per label over the development set with a fixed-rate
/// descent loop, stopping once the number of correctly classified items
/// stops improving, then persists the last-improving weight vector.
/// </summary>
public void Train()
{
    // Load the pre-trained Bayes model and development-set features from disk.
    this.statisticModel = BayesTest.LoadModel(@"D:\Codes\C#\EntityTyping\Fine-ner\unit test\output\model.txt");
    this.developFeatures = LoadBayesData(@"D:\Codes\C#\EntityTyping\Fine-ner\unit test\output\developFeature.txt");
    this.labels = GetLabels().ToList();
    this.w = new Dictionary<string, double>(this.labels.Count);
    foreach (var label in labels) // Initial weight vector
    {
        w[label] = 1;
    }
    var learnSpeed = 0.005;   // fixed learning rate
    var lastPositive = -1;    // positive count of the previous iteration (-1 = not started)
    this.positive = -1;
    int steps = 0;
    var lastWeight = new Dictionary<string, double>(w);
    // Iterate while the number of correctly classified items keeps improving.
    while (lastPositive == -1 || ((this.positive - lastPositive) > 0))
    {
        steps++;
        //learnSpeed /= steps;
        lastPositive = this.positive;
        var diff = GetDiff(); // gradient; presumably also refreshes this.positive — TODO confirm
        // Snapshot weights BEFORE the update: when the loop exits because
        // the last step did not improve, the snapshot (not w) is persisted.
        lastWeight = new Dictionary<string, double>(w);
        foreach (var label in labels)
        {
            w[label] -= diff[label] * learnSpeed;
        }
        Console.WriteLine("positive: " + positive);
        foreach (var label in labels)
        {
            Console.Write(label + "\t" + w[label] + "\t");
        }
        Console.WriteLine("");
    }
    // Persist the best (pre-degradation) weight vector.
    FileWriter writer = new LargeFileWriter(@"D:\Codes\C#\EntityTyping\Fine-ner\unit test\output\weight.txt", FileMode.Create);
    foreach (var label in labels)
    {
        writer.WriteLine(label + "\t" + lastWeight[label]);
    }
    writer.Close();
}
/// <summary>
/// For the CoNLL train feature file, report per label how many events have
/// a known DBpedia type, the total, and the coverage ratio.
/// </summary>
public static void Temp6()
{
    var sourceFile = @"D:\Codes\Project\EntityTyping\Fine-ner\output\conll feature\raw\train.txt";
    var desFile = @"D:\Codes\Project\EntityTyping\Fine-ner\output\conll feature\raw\train data in dbpedia info.txt";
    var writer = new LargeFileWriter(desFile, FileMode.Create);
    var coverNumByType = new Dictionary<string, int>();  // label --> events with known type
    var totals = new Dictionary<string, int>();          // label --> all events
    var reader = new EventReaderByLine(sourceFile);
    while (reader.HasNext())
    {
        var ev = reader.GetNextEvent();
        var rawFeature = ev.Feature.ToList();
        var label = ev.Label.ToString();
        // Fix: use TryGetValue instead of the original's exception-driven
        // counter updates (catching KeyNotFoundException as control flow).
        int total;
        totals.TryGetValue(label, out total);
        totals[label] = total + 1;
        if (!rawFeature[(int)Event.Field.dbpediaTypes].Equals("UNKNOW"))
        {
            int covered;
            coverNumByType.TryGetValue(label, out covered);
            coverNumByType[label] = covered + 1;
        }
    }
    reader.Close();
    foreach (var type in totals.Keys)
    {
        // Fix: a label with zero covered events used to throw
        // KeyNotFoundException here; default to 0 instead.
        int covered;
        coverNumByType.TryGetValue(type, out covered);
        writer.WriteLine(type + "\t" + covered + "\t" + totals[type] + "\t" + (1.0 * covered / totals[type]));
    }
    writer.Close();
}
/// <summary>
/// Extract "entity TAB type" pairs from the DBpedia mapping-based-types dump.
/// </summary>
public static void Temp()
{
    var source = @"D:\Data\DBpedia\mapping based types";
    var des = @"D:\Data\DBpedia\entity type pairs.txt";
    var reader = new pml.file.reader.LargeFileReader(source);
    var writer = new LargeFileWriter(des, FileMode.Create);
    // Entity: last path segment before "> <".
    var entityRegex = new System.Text.RegularExpressions.Regex(@"/([^>/]+)>\s<");
    // Type: ontology class name at the end of the triple.
    var typeRegex = new System.Text.RegularExpressions.Regex(@"ontology/(\w+)>\s\.$");
    int count = 0;
    string line;
    // Fix: removed the unused desDir and dic locals from the original.
    while ((line = reader.ReadLine()) != null)
    {
        var entityMatch = entityRegex.Match(line);
        var typeMatch = typeRegex.Match(line);
        // Emit only lines where both the entity and the type were recognized.
        if (entityMatch.Success && typeMatch.Success)
        {
            if (++count % 10000 == 0)
            {
                Console.WriteLine(count);  // progress report
            }
            writer.WriteLine(entityMatch.Groups[1].Value + "\t" + typeMatch.Groups[1].Value);
        }
    }
    reader.Close();
    writer.Close();
}
/// <summary>
/// Convert the DBpedia redirects.ttl dump into simple
/// "source TAB target" pairs.
/// </summary>
public static void Temp3()
{
    var source = @"D:\Data\DBpedia\redirects.ttl";
    var des = @"D:\Data\DBpedia\redirects.txt";
    var reader = new pml.file.reader.LargeFileReader(source);
    var writer = new LargeFileWriter(des, FileMode.Create);
    // Last path segment before "> <" (the redirect source).
    var firstRegex = new System.Text.RegularExpressions.Regex(@"/([^>/]+)>\s<");
    // Last path segment at the end of the triple (the redirect target).
    var secondRegex = new System.Text.RegularExpressions.Regex(@"/(\w+)>\s\.$");
    var count = 0;
    string line;
    while ((line = reader.ReadLine()) != null)
    {
        var firstMatch = firstRegex.Match(line);
        var secondMatch = secondRegex.Match(line);
        // Emit only lines where both resources were recognized.
        if (firstMatch.Success && secondMatch.Success)
        {
            if (++count % 10000 == 0)
            {
                Console.WriteLine(count);  // progress report
            }
            writer.WriteLine(firstMatch.Groups[1].Value + "\t" + secondMatch.Groups[1].Value);
        }
    }
    reader.Close();
    writer.Close();
}
/// <summary>
/// Split the original file into one read-only file per top-level type and
/// record how many items each type received.
/// </summary>
public void Refine()
{
    if (low2top == null)
    {
        LoadHierarchy();
    }
    var reader = new LargeFileReader(originalFile);
    // One writer per top-level type; '.' in type names is not path-safe.
    HashSet<string> topTypes = new HashSet<string>(low2top.Values);
    var writers = new Dictionary<string, FileWriter>();
    var paths = new List<string>();
    foreach (var type in topTypes)
    {
        var path = Path.Combine(this.refinedDir, type.Replace('.', '_') + ".txt");
        writers[type] = new LargeFileWriter(path, FileMode.Create);
        paths.Add(path);
    }
    string line;
    var numByType = new Dictionary<string, int>();
    int count = 0;
    while ((line = reader.ReadLine()) != null)
    {
        if (++count % 10000 == 0)
        {
            Console.WriteLine(count);  // progress report
        }
        var array = line.Split(new char[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
        if (!IsValidItem(array))
        {
            continue;
        }
        // NOTE(review): this.type is presumably set by IsValidItem — confirm.
        FileWriter writer;
        if (!writers.TryGetValue(this.type, out writer))
        {
            // Fix: the original caught the failed lookup, logged it, and then
            // wrote to a stale (or null) writer anyway; skip unknown types.
            Console.WriteLine("No writer for type: " + this.type);
            continue;
        }
        writer.WriteLine(string.Format("{0}\t{1}\t{2}\t{3}", array[0], array[1], this.type, array[3].Substring(3)));
        int num;
        numByType.TryGetValue(this.type, out num);
        numByType[this.type] = num + 1;
    }
    reader.Close();
    foreach (var w in writers.Values)
    {
        w.Close();
    }
    // Protect the refined files from accidental modification.
    foreach (var p in paths)
    {
        File.SetAttributes(p, FileAttributes.ReadOnly);
    }
    var statWriter = new LargeFileWriter(statisticInfoFile, FileMode.Create);
    foreach (var type in numByType.Keys)
    {
        statWriter.WriteLine(type + "\t" + numByType[type]);
    }
    statWriter.Close();
}
/// <summary>
/// Map DBpedia types to satori types: for every DBpedia entity whose name
/// matches a satori entity (preferred) or mention, count the satori type
/// it carries, then dump "dbpediaType TAB satoriType TAB times" triples.
/// </summary>
public static void Temp2()
{
    var dbpediaToSatoriDic = new Dictionary<string, Dictionary<string, int>>();
    var satoriMentionDic = new Dictionary<string, string>();  // mention --> satori type
    var satoriEntityDic = new Dictionary<string, string>();   // entity  --> satori type
    var dbpedia = @"D:\Data\DBpedia\entity type pairs.txt";
    var satori = @"D:\Codes\C#\EntityTyping\Fine-ner\input\feature\train.txt";
    var des = @"D:\Codes\C#\EntityTyping\Fine-ner\input\db2satori.txt";
    var dbpediaReader = new LargeFileReader(dbpedia);
    var satoriReader = new LargeFileReader(satori);
    var writer = new LargeFileWriter(des, FileMode.Create);
    string line;
    // Load satori train data: columns are mention, entity, type.
    while ((line = satoriReader.ReadLine()) != null)
    {
        var array = line.Split('\t');
        satoriMentionDic[array[0]] = array[2];
        satoriEntityDic[array[1]] = array[2];
    }
    satoriReader.Close();
    var regex = new System.Text.RegularExpressions.Regex(@"\s+");
    var deleteBrace = new System.Text.RegularExpressions.Regex(@"\(\w+\)");
    var count = 0;
    while ((line = dbpediaReader.ReadLine()) != null)
    {
        if (++count % 10000 == 0)
        {
            Console.WriteLine(count);  // progress report
        }
        var array = line.Split('\t');
        // Normalize the DBpedia entity name: drop "(qualifier)" and collapse
        // whitespace runs to single spaces.
        var entity = deleteBrace.Replace(array[0], "");
        entity = regex.Replace(entity, " ").Trim();
        // Prefer an exact entity match; fall back to a mention match.
        string satoriType;
        if (!satoriEntityDic.TryGetValue(entity, out satoriType))
        {
            satoriMentionDic.TryGetValue(entity, out satoriType);
        }
        if (satoriType == null)
        {
            continue;
        }
        // Fix: replaced the original's duplicated exception-driven dictionary
        // updates with TryGetValue lookups.
        Dictionary<string, int> dic;
        if (!dbpediaToSatoriDic.TryGetValue(array[1], out dic))
        {
            dic = new Dictionary<string, int>();
            dbpediaToSatoriDic[array[1]] = dic;
        }
        int times;
        dic.TryGetValue(satoriType, out times);
        dic[satoriType] = times + 1;
    }
    dbpediaReader.Close();
    foreach (var item in dbpediaToSatoriDic)
    {
        foreach (var d in item.Value)
        {
            writer.WriteLine(item.Key + "\t" + d.Key + "\t" + d.Value);
        }
    }
    writer.Close();
}
/// <summary>
/// Runs the Bayes model over sourceFile, writes per-item predictions with
/// per-label scores to resultFile, then appends a confusion matrix with
/// per-label recall and precision.
/// </summary>
public void Test()
{
    // Lazily initialize the model on first use.
    if (model == null)
    {
        Initial();
    }
    var fields = BayesModel.GetFields(sourceFile);
    FileReader reader = new LargeFileReader(sourceFile);
    FileWriter writer = new LargeFileWriter(resultFile, FileMode.Create);
    // actual label --> (predicted label --> times), i.e. the confusion matrix
    var detailDic = new Dictionary<string, Dictionary<string, int>>();
    var positiveNums = new Dictionary<string, int>();   // positive number by type
    var predictedNums = new Dictionary<string, int>();  // predicted number by type
    var actualNums = new Dictionary<string, int>();     // actual number by type
    Dictionary<string, int> dic = null;
    Pair<string, Dictionary<string, object>> feature = null;
    var i = 0;
    while ((feature = BayesModel.GetFeatureItem(reader, fields)) != null)
    {
        i++;
        var label = feature.first;
        string predictedLabel = null;
        try
        {
            predictedLabel = Predict(feature.second);
        }
        catch (Exception)
        {
            // Prediction failed: record a NULL result and move on.
            Console.WriteLine("Wrong!");
            writer.WriteLine(i + "\t" + label + "\tNULL");
            continue;
        }
        // One result line: index, actual label, then each label's score.
        writer.Write(string.Format("{0}\t{1, -30}", i, label));
        foreach (var score in this.scores)
        {
            writer.Write(string.Format("{0,30}:{1,-10:F2}", score.first, score.second));
        }
        writer.Write("\r");
        // NOTE(review): the counters below use catch(Exception) as the
        // missing-key default path; KeyNotFoundException is the expected case.
        if (label.Equals(predictedLabel))
        {
            try
            {
                positiveNums[label] += 1;
            }
            catch (Exception)
            {
                positiveNums[label] = 1;
            }
        }
        try
        {
            // update predicted number
            predictedNums[predictedLabel] += 1;
        }
        catch (Exception)
        {
            predictedNums[predictedLabel] = 1;
        }
        try
        {
            // update actually number
            actualNums[label] += 1;
        }
        catch (Exception)
        {
            actualNums[label] = 1;
        }
        // update detail dictionary
        try
        {
            dic = detailDic[label];
        }
        catch (Exception)
        {
            dic = new Dictionary<string, int>();
            detailDic[label] = dic;
        }
        try
        {
            dic[predictedLabel] += 1;
        }
        catch (Exception)
        {
            dic[predictedLabel] = 1;
        }
    }
    // Build the summary table: header row, one row per actual label with
    // counts plus recall, then a final precision row.
    var buffer = new StringBuilder();
    buffer.Append(string.Format("{0,-30}", "actual label |predicted type"));
    foreach (var key in this.labels)
    {
        buffer.Append(string.Format("{0,-30}", key));
    }
    buffer.Append(string.Format("{0,-30}\r", "recall"));
    foreach (var key in this.labels)
    {
        buffer.Append(string.Format("{0,-30}", key));
        // NOTE(review): dic[k] throws if an (actual, predicted) pair never
        // occurred — assumes a fully populated matrix; confirm.
        dic = detailDic[key];
        foreach (var k in this.labels)
        {
            buffer.Append(string.Format("{0,-30}", dic[k]));
        }
        // recall
        buffer.Append(string.Format("{0,-30}\r", 1.0 * positiveNums[key] / actualNums[key]));
    }
    buffer.Append(string.Format("{0,-30}", "precision"));
    foreach (var key in this.labels)
    {
        buffer.Append(string.Format("{0,-30:f5}", 1.0 * positiveNums[key] / predictedNums[key]));
    }
    buffer.Append("\r");
    writer.WriteLine(buffer.ToString());
    writer.Close();
}
/// <summary>
/// Build a confusion matrix from a result file ("index TAB trueLabel TAB
/// predictedLabel" per line, header skipped) and write it together with
/// macro precision/recall/F1 and micro precision to the evaluation file.
/// </summary>
/// <param name="resultFile">Classifier output to evaluate.</param>
/// <param name="evaluationFile">Destination for the evaluation report.</param>
public void EvaluateResult(string resultFile, string evaluationFile)
{
    var reader = new LargeFileReader(resultFile);
    // class --> (predicted class --> number)
    var result = new Dictionary<string, Dictionary<string, int>>();
    var trueLabelIndex = 1;
    var predictLabelIndex = 2;
    var writer = new LargeFileWriter(evaluationFile, FileMode.Create);
    // Skip the header line.
    reader.ReadLine();
    string line;
    while ((line = reader.ReadLine()) != null)
    {
        var array = line.Split('\t');
        // Fix: replaced exception-driven updates with TryGetValue, and use
        // predictLabelIndex consistently (the original hard-coded array[2]
        // in its catch branch).
        Dictionary<string, int> dic;
        if (!result.TryGetValue(array[trueLabelIndex], out dic))
        {
            dic = new Dictionary<string, int>();
            result[array[trueLabelIndex]] = dic;
        }
        int times;
        dic.TryGetValue(array[predictLabelIndex], out times);
        dic[array[predictLabelIndex]] = times + 1;
    }
    reader.Close();
    // Confusion matrix: rows are true labels, columns predicted labels.
    writer.Write("True|Predict");
    var keys = result.Keys;
    foreach (var key in keys)
    {
        writer.Write("\t" + key);
    }
    writer.WriteLine("");
    foreach (var key in keys)
    {
        writer.Write(key);
        var info = result[key];
        foreach (var k in keys)
        {
            int times;
            if (info.TryGetValue(k, out times))
            {
                writer.Write("\t" + times);
            }
            else
            {
                writer.Write("\t" + 0);
            }
        }
        writer.WriteLine("");
    }
    var macroPre = Util.GetMacroPrecision(result);
    var macroRec = Util.GetMacroRecall(result);
    var macroF1 = Util.GetF1(macroPre, macroRec);
    writer.WriteLine("macro-precision: " + macroPre);
    writer.WriteLine("macro-recall : " + macroRec);
    writer.WriteLine("macro-F1 : " + macroF1);
    var microPre = Util.GetMicroPrecision(result);
    writer.WriteLine("micro-precision: " + microPre);
    writer.Close();
}
/// <summary>
/// Splits refined data into train / develop / test sets, capping mentions
/// per entity and train size per type, and writes per-type mention
/// statistics. NOTE(review): the generic per-directory file lists are
/// overwritten by hard-coded time_event.txt paths below — looks like a
/// debugging leftover; confirm before relying on the general path.
/// </summary>
public void SplitData()
{
    var sourceDic = LoadTotalNumByType();
    var mentionNumDic = new Dictionary<string, int>();                    // type --> train mentions kept
    var uniqueMentionNumDic = new Dictionary<string, HashSet<string>>();  // type --> distinct mention surface forms
    // create reader by file
    var files = Directory.GetFiles(this.sourceDir);
    var reader = new LargeFileReader();
    // create file path to store train, develop and test data
    var trainFiles = new List<string>();
    var devFiles = new List<string>();
    var testFiles = new List<string>();
    foreach (var file in files)
    {
        trainFiles.Add(Path.Combine(trainDir, Path.GetFileName(file)));
        devFiles.Add(Path.Combine(developDir, Path.GetFileName(file)));
        testFiles.Add(Path.Combine(testDir, Path.GetFileName(file)));
    }
    var writers = new List<FileWriter>();
    // random value generator to seperate develop and test data
    var random = new Random();
    string line;
    string[] array;
    int num = 0;
    int trainNumLimit = 500000;          // max train mentions per type
    HashSet<string> set = null;
    int count = 0;
    int limitMentionNumPerEntity = 10;   // max mentions kept per entity
    int numByEntity = 0;
    int devNumLimit = 4000;              // dev + test cap is twice this
    int i = 0;
    // Hard-coded override: process only time_event.txt (debug leftover).
    files = new string[] { @"E:\Users\v-mipeng\Codes\Projects\EntityTyping\Fine-ner\input\satori\refined-satori\time_event.txt" };
    trainFiles.Clear();
    trainFiles.Add(@"E:\Users\v-mipeng\Codes\Projects\EntityTyping\Fine-ner\input\satori\train\time_event.txt");
    devFiles.Clear();
    devFiles.Add(@"E:\Users\v-mipeng\Codes\Projects\EntityTyping\Fine-ner\input\satori\develop\time_event.txt");
    testFiles.Clear();
    testFiles.Add(@"E:\Users\v-mipeng\Codes\Projects\EntityTyping\Fine-ner\input\satori\test\time_event.txt");
    foreach (var file in files)
    {
        reader.Open(file);
        string lastEntity = "";
        writers.Clear();
        writers.Add(new LargeFileWriter(devFiles[i], FileMode.Create));   // writers[0] = develop
        writers.Add(new LargeFileWriter(testFiles[i], FileMode.Create));  // writers[1] = test
        var trainWriter = new LargeFileWriter(trainFiles[i], FileMode.Create);
        i++;
        int devOrTestNum = 0;
        while ((line = reader.ReadLine()) != null)
        {
            if (++count % 10000 == 0)
            {
                Console.WriteLine(count);  // progress report
            }
            array = line.Split('\t');
            // Input appears grouped by entity (column 1): count consecutive
            // mentions of the same entity.
            if (array[1].Equals(lastEntity))
            {
                numByEntity++;
            }
            else
            {
                numByEntity = 1;
                lastEntity = array[1];
            }
            if (numByEntity > limitMentionNumPerEntity)
            {
                continue;  // entity already contributed enough mentions
            }
            mentionNumDic.TryGetValue(array[2], out num);
            // Keep for train while under both the absolute cap and 80% of the
            // type's (scaled) total; overflow is routed to dev/test.
            if (num < trainNumLimit && num < 0.8 * sourceDic[array[2]] / limitMentionNumPerEntity)
            {
                SaveForTrain(trainWriter, line);
                mentionNumDic[array[2]] = num + 1;
                uniqueMentionNumDic.TryGetValue(array[2], out set);
                if (set == null)
                {
                    set = uniqueMentionNumDic[array[2]] = new HashSet<string>();
                }
                if (!set.Contains(array[0]))
                {
                    set.Add(array[0]);
                }
            }
            else if (devOrTestNum < devNumLimit * 2)
            {
                // Randomly route the item to develop or test.
                devOrTestNum++;
                SaveForDevOrTest(writers[random.Next(0, 2)], line);
            }
        }
        reader.Close();
        trainWriter.Close();
        writers[0].Close();
        writers[1].Close();
    }
    // Per-type statistics: total kept mentions and distinct surface forms.
    var writer = new LargeFileWriter(statisticInfoFile, FileMode.Create);
    foreach (var key in mentionNumDic.Keys)
    {
        writer.WriteLine(key + "\t" + mentionNumDic[key]);
        writer.WriteLine(key + "\t" + uniqueMentionNumDic[key].Count);
    }
    writer.Close();
    // Mark the generated splits read-only.
    foreach (var file in trainFiles)
    {
        File.SetAttributes(file, FileAttributes.ReadOnly);
    }
    foreach (var file in devFiles)
    {
        File.SetAttributes(file, FileAttributes.ReadOnly);
    }
    foreach (var file in testFiles)
    {
        File.SetAttributes(file, FileAttributes.ReadOnly);
    }
}
/// <summary>
/// Grab-bag of one-off data-munging snippets; all but the last are gated
/// behind if (false) and are kept only for reference.
/// </summary>
public static void Temp()
{
    // Disabled: cap copied train data at ~100000 items per type.
    if (false)
    {
        var reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\train.txt");
        var writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\limited train.txt", FileMode.OpenOrCreate);
        Dictionary<string, int> numByType = new Dictionary<string, int>(16);
        String line;
        String[] array;
        int count = 0;
        int num = 0;
        while ((line = reader.ReadLine()) != null)
        {
            count++;
            if (count % 1000 == 0)
            {
                Console.Error.WriteLine(count + " items processed!");
            }
            array = line.Split('\t');
            // catch(Exception) used as the missing-key default for the counter.
            try
            {
                num = numByType[array[1]];
            }
            catch (Exception)
            {
                num = 0;
            }
            if (num > 100000) // do not limit train data number by type
            {
                continue;
            }
            writer.WriteLine(line);
            numByType[array[1]] = ++num;
        }
        reader.Close();
        writer.Close();
    }
    // Disabled: replace the first column of a result file with the first
    // column of the corresponding develop file, via a temp file.
    if (false)
    {
        string result = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\result\satori_lc\-1.inst.txt";
        string source = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\develop.txt";
        String tmpFile = "./tmp.txt";
        FileReader reader1 = new LargeFileReader(result);
        FileReader reader2 = new LargeFileReader(source);
        FileWriter writer = new LargeFileWriter(tmpFile, FileMode.OpenOrCreate);
        String line;
        String line2;
        writer.WriteLine(reader1.ReadLine()); // copy the header unchanged
        while ((line = reader1.ReadLine()) != null)
        {
            line2 = reader2.ReadLine();
            writer.WriteLine(line2.Split('\t')[0] + "\t" + line.Split(new char[] { '\t' }, 2)[1]);
        }
        reader1.Close();
        reader2.Close();
        writer.Close();
        File.Copy(tmpFile, @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\result\satori_lc\.inst.txt");
        File.Delete(tmpFile);
    }
    // Disabled: rebuild the word table with stemmed words.
    // NOTE(review): the stemming body is commented out, so wordSet stays
    // empty and enabling this would overwrite the word table with nothing.
    if (false)
    {
        string wordTableFile = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\word table\wordTable.txt";
        FileReader reader = new LargeFileReader(wordTableFile);
        FileWriter writer = new LargeFileWriter();
        HashSet<string> wordSet = new HashSet<string>();
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            //var stemmer = StemmerPool.GetStemmer();
            //wordSet.Add(stemmer.Stem(line.Split('\t')[0])[0]);
            //StemmerPool.ReturnStemmer(stemmer);
            //stemmer = null;
        }
        reader.Close();
        writer.Open(wordTableFile);
        int i = 0;
        foreach (String word in wordSet)
        {
            writer.WriteLine(word + '\t' + (i++));
        }
        writer.Close();
    }
    // Disabled: merge all name dictionaries into one name-all file, keeping
    // only the first comma-separated field of each line.
    if (false)
    {
        String dir = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\names";
        string des = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\name-all.txt";
        FileReader reader = new LargeFileReader();
        FileWriter writer = new LargeFileWriter(des, FileMode.Create);
        string[] files = Directory.GetFiles(dir, "*.txt");
        string line;
        foreach (String file in files)
        {
            reader.Open(file);
            while ((line = reader.ReadLine()) != null)
            {
                writer.WriteLine(line.Split(',')[0]);
            }
        }
        reader.Close();
        writer.Close();
    }
    // Disabled: drop names from name-all that appear in the train data with
    // a non-person label (lookups use the lowercased mention).
    if (false)
    {
        string path1 = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\name-all.txt";
        string path2 = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\limited train.txt";
        string des = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\tmp.txt";
        FileReader reader = new LargeFileReader(path1);
        FileWriter writer = new LargeFileWriter(des);
        String line;
        HashSet<String> set = new HashSet<string>();
        String[] array;
        while ((line = reader.ReadLine()) != null)
        {
            set.Add(line);
            array = line.Split(' '); // NOTE(review): split result is unused
        }
        reader.Close();
        reader.Open(path2);
        while ((line = reader.ReadLine()) != null)
        {
            array = line.Split('\t');
            if (set.Contains(array[0].ToLower()))
            {
                if (!array[1].Equals("people.person"))
                {
                    set.Remove(array[0].ToLower());
                }
            }
        }
        reader.Close();
        foreach (String name in set)
        {
            writer.WriteLine(name);
        }
        writer.Close();
    }
    // Disabled: keep only person/location/organization items from a
    // backed-up develop file.
    if (false)
    {
        FileReader reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\backup\version 1-2\develop.txt");
        FileWriter writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\develop.txt", FileMode.OpenOrCreate);
        String line;
        string[] array;
        HashSet<string> interestTypes = new HashSet<string>();
        interestTypes.Add("people.person");
        interestTypes.Add("location.location");
        interestTypes.Add("organization.organization");
        while ((line = reader.ReadLine()) != null)
        {
            array = line.Split('\t');
            if (interestTypes.Contains(array[1]))
            {
                writer.WriteLine(line);
            }
        }
        reader.Close();
        writer.Close();
    }
    // Disabled: sort each line's "key:count" pairs by count, descending.
    if (false)
    {
        FileReader reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp.txt");
        FileWriter writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp2.txt", FileMode.Create);
        String line;
        string[] array;
        string[] pairString;
        List<Pair<string, int>> list = new List<Pair<string, int>>();
        Pair<string, int> pair = new Pair<string, int>();
        Comparer<Pair<string, int>> comparer = pair.GetBySecondReverseComparer();
        while ((line = reader.ReadLine()) != null)
        {
            array = line.Split('\t');
            for (int i = 1; i < array.Length; i++)
            {
                // Split "key:count" on the LAST colon (keys may contain ':').
                pairString = new string[] { array[i].Substring(0, array[i].LastIndexOf(":")), array[i].Substring(array[i].LastIndexOf(":") + 1) };
                pair = new Pair<string, int>();
                pair.first = pairString[0];
                pair.second = int.Parse(pairString[1]);
                list.Add(pair);
            }
            list.Sort(comparer);
            foreach (Pair<string, int> item in list)
            {
                writer.Write("\t" + item.first + ":" + item.second);
            }
            writer.Write("\r");
            list.Clear();
        }
        reader.Close();
        writer.Close();
    }
    // Active: read three "key:count" lines, sort the first line's pairs by
    // count, and emit "key:(n0|n1|n2)" with the matching counts from the
    // other two lines, five entries per output line.
    if (true)
    {
        FileReader reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp.txt");
        FileWriter writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp2.txt", FileMode.Create);
        String line;
        string[] lines = new string[3]; // NOTE(review): unused
        string[] array;
        string[] pairString;
        Dictionary<string, int>[] dics = new Dictionary<string, int>[3];
        List<Pair<string, int>> list = new List<Pair<string, int>>();
        Pair<string, int> pair = new Pair<string, int>();
        Comparer<Pair<string, int>> comparer = pair.GetBySecondReverseComparer();
        for (int i = 0; i < 3; i++)
        {
            line = reader.ReadLine();
            array = line.Split('\t');
            dics[i] = new Dictionary<string, int>();
            if (i == 0)
            {
                // First line: keep both the lookup dictionary and the sortable list.
                for (int j = 1; j < array.Length; j++)
                {
                    pairString = new string[] { array[j].Substring(0, array[j].LastIndexOf(":")), array[j].Substring(array[j].LastIndexOf(":") + 1) };
                    pair = new Pair<string, int>();
                    pair.first = pairString[0];
                    pair.second = int.Parse(pairString[1]);
                    dics[i][pair.first] = pair.second;
                    list.Add(pair);
                }
            }
            else
            {
                for (int j = 1; j < array.Length; j++)
                {
                    pairString = new string[] { array[j].Substring(0, array[j].LastIndexOf(":")), array[j].Substring(array[j].LastIndexOf(":") + 1) };
                    dics[i][pairString[0]] = int.Parse(pairString[1]);
                }
            }
        }
        list.Sort(comparer);
        int count = 10; // NOTE(review): starting at 10 shifts where the first line break lands — confirm intent
        int locNum;
        int orgNum;
        foreach (Pair<string, int> item in list)
        {
            count++;
            try
            {
                locNum = dics[1][item.first];
            }
            catch (Exception)
            {
                locNum = 0;
            }
            try
            {
                orgNum = dics[2][item.first];
            }
            catch (Exception)
            {
                orgNum = 0;
            }
            writer.Write("\t" + item.first + ":(" + item.second + "|" + locNum + "|" + orgNum + ")");
            if (count % 5 == 0)
            {
                writer.Write("\r");
            }
        }
        reader.Close();
        writer.Close();
    }
}