/// <summary>
/// Scratch entry point for one-off data-preparation tasks.
/// Each task lives in its own private helper and is switched on or off by
/// editing the literal condition in front of its call; only the last task
/// is currently enabled.
/// </summary>
public static void Temp()
{
    if (false)
    {
        TempLimitTrainLinesPerType();
    }
    if (false)
    {
        TempReplaceFirstColumnOfResult();
    }
    if (false)
    {
        TempRewriteWordTable();
    }
    if (false)
    {
        TempMergeNameDictionaries();
    }
    if (false)
    {
        TempKeepPersonOnlyNames();
    }
    if (false)
    {
        TempFilterDevelopSetByType();
    }
    if (false)
    {
        TempSortCountsPerLine();
    }
    if (true)
    {
        TempWriteCombinedCounts();
    }
}

/// <summary>
/// Copies train.txt to "limited train.txt", skipping a line once more than
/// 100000 lines with the same second tab-separated field have been written.
/// Progress is reported on stderr every 1000 input lines.
/// </summary>
private static void TempLimitTrainLinesPerType()
{
    var reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\train.txt");
    LargeFileWriter writer = null;
    try
    {
        // NOTE(review): OpenOrCreate is kept from the original; whether an existing,
        // longer file is truncated depends on LargeFileWriter - confirm.
        writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\limited train.txt", FileMode.OpenOrCreate);
        Dictionary<string, int> writtenByType = new Dictionary<string, int>(16);
        string line;
        int processed = 0;

        while ((line = reader.ReadLine()) != null)
        {
            processed++;
            if (processed % 1000 == 0)
            {
                Console.Error.WriteLine(processed + " items processed!");
            }

            string type = line.Split('\t')[1];
            int written;
            // TryGetValue leaves 'written' at 0 for a type that has not been seen yet.
            writtenByType.TryGetValue(type, out written);

            // NOTE(review): '>' lets 100001 lines per type through; confirm whether
            // the intended cap is exactly 100000.
            if (written > 100000)
            {
                continue;
            }
            writer.WriteLine(line);
            writtenByType[type] = written + 1;
        }
    }
    finally
    {
        reader.Close();
        if (writer != null)
        {
            writer.Close();
        }
    }
}

/// <summary>
/// Rebuilds the instance result file: the first line of the result file is
/// copied unchanged, and for every following line the text before the first
/// tab is replaced by the first tab-separated field of the next line read
/// from the develop file. The output goes to ./tmp.txt, which is then copied
/// to ".inst.txt" and deleted.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The develop file has fewer lines than the result file needs.
/// </exception>
private static void TempReplaceFirstColumnOfResult()
{
    string result = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\result\satori_lc\-1.inst.txt";
    string source = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\develop.txt";
    string tmpFile = "./tmp.txt";

    FileReader resultReader = new LargeFileReader(result);
    FileReader sourceReader = null;
    FileWriter writer = null;
    try
    {
        sourceReader = new LargeFileReader(source);
        writer = new LargeFileWriter(tmpFile, FileMode.OpenOrCreate);

        // The first result line has no counterpart in the source file and is copied as is.
        writer.WriteLine(resultReader.ReadLine());

        string resultLine;
        while ((resultLine = resultReader.ReadLine()) != null)
        {
            string sourceLine = sourceReader.ReadLine();
            if (sourceLine == null)
            {
                throw new InvalidOperationException("Source file '" + source + "' has fewer lines than result file '" + result + "'.");
            }
            writer.WriteLine(sourceLine.Split('\t')[0] + "\t" + resultLine.Split(new char[] { '\t' }, 2)[1]);
        }
    }
    finally
    {
        resultReader.Close();
        if (sourceReader != null)
        {
            sourceReader.Close();
        }
        if (writer != null)
        {
            writer.Close();
        }
    }

    // NOTE(review): File.Copy without the overwrite flag throws when the
    // destination already exists; kept as in the original.
    File.Copy(tmpFile, @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\result\satori_lc\.inst.txt");
    File.Delete(tmpFile);
}

/// <summary>
/// Reads the word table and rewrites it as "word TAB index" lines taken from
/// a set of words.
/// </summary>
/// <remarks>
/// WARNING: the code that filled the word set is commented out, so the set
/// stays empty and no lines are written back to the word table file.
/// </remarks>
private static void TempRewriteWordTable()
{
    string wordTableFile = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\word table\wordTable.txt";
    HashSet<string> wordSet = new HashSet<string>();

    FileReader reader = new LargeFileReader(wordTableFile);
    try
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            //var stemmer = StemmerPool.GetStemmer();
            //wordSet.Add(stemmer.Stem(line.Split('\t')[0])[0]);
            //StemmerPool.ReturnStemmer(stemmer);
            //stemmer = null;
        }
    }
    finally
    {
        reader.Close();
    }

    FileWriter writer = new LargeFileWriter();
    try
    {
        writer.Open(wordTableFile);
        int index = 0;
        foreach (string word in wordSet)
        {
            writer.WriteLine(word + '\t' + (index++));
        }
    }
    finally
    {
        writer.Close();
    }
}

/// <summary>
/// Writes the text before the first comma of every line of every *.txt file
/// in the names directory into name-all.txt.
/// </summary>
private static void TempMergeNameDictionaries()
{
    string dir = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\names";
    string des = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\name-all.txt";

    FileReader reader = new LargeFileReader();
    FileWriter writer = new LargeFileWriter(des, FileMode.Create);
    try
    {
        foreach (string file in Directory.GetFiles(dir, "*.txt"))
        {
            reader.Open(file);
            try
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    writer.WriteLine(line.Split(',')[0]);
                }
            }
            finally
            {
                // Close after every file rather than once at the end, so a handle is
                // not left open if Open() does not release the previous file itself.
                reader.Close();
            }
        }
    }
    finally
    {
        writer.Close();
    }
}

/// <summary>
/// Loads every line of name-all.txt into a set, removes each name that occurs
/// (lower-cased) as the first field of a "limited train.txt" line whose second
/// field is not "people.person", and writes the remaining names to tmp.txt.
/// </summary>
private static void TempKeepPersonOnlyNames()
{
    string path1 = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\name-all.txt";
    string path2 = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\limited train.txt";
    string des = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\tmp.txt";

    FileReader reader = new LargeFileReader(path1);
    FileWriter writer = null;
    try
    {
        writer = new LargeFileWriter(des);
        HashSet<string> names = new HashSet<string>();
        string line;

        try
        {
            while ((line = reader.ReadLine()) != null)
            {
                names.Add(line);
            }
        }
        finally
        {
            reader.Close();
        }

        reader.Open(path2);
        try
        {
            while ((line = reader.ReadLine()) != null)
            {
                string[] fields = line.Split('\t');
                // Invariant lower-casing keeps the match independent of the machine's culture.
                // presumably the dictionary entries are already lower-case - verify.
                string key = fields[0].ToLowerInvariant();
                if (names.Contains(key) && !fields[1].Equals("people.person"))
                {
                    names.Remove(key);
                }
            }
        }
        finally
        {
            reader.Close();
        }

        foreach (string name in names)
        {
            writer.WriteLine(name);
        }
    }
    finally
    {
        if (writer != null)
        {
            writer.Close();
        }
    }
}

/// <summary>
/// Copies the lines of the backed-up develop set whose second tab-separated
/// field is people.person, location.location or organization.organization
/// into the current develop.txt.
/// </summary>
private static void TempFilterDevelopSetByType()
{
    FileReader reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\backup\version 1-2\develop.txt");
    FileWriter writer = null;
    try
    {
        writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\develop.txt", FileMode.OpenOrCreate);
        HashSet<string> interestTypes = new HashSet<string>
        {
            "people.person",
            "location.location",
            "organization.organization"
        };

        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (interestTypes.Contains(line.Split('\t')[1]))
            {
                writer.WriteLine(line);
            }
        }
    }
    finally
    {
        reader.Close();
        if (writer != null)
        {
            writer.Close();
        }
    }
}

/// <summary>
/// For every line of tmp.txt, parses the tab-separated "name:count" fields
/// after the first field, sorts them with the comparer returned by
/// Pair.GetBySecondReverseComparer() (presumably descending by count - confirm
/// against Pair), and writes them to tmp2.txt as "\tname:count" items.
/// The first field of each input line is not written.
/// </summary>
private static void TempSortCountsPerLine()
{
    FileReader reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp.txt");
    FileWriter writer = null;
    try
    {
        writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp2.txt", FileMode.Create);
        Comparer<Pair<string, int>> comparer = new Pair<string, int>().GetBySecondReverseComparer();
        List<Pair<string, int>> entries = new List<Pair<string, int>>();

        string line;
        while ((line = reader.ReadLine()) != null)
        {
            string[] fields = line.Split('\t');
            for (int i = 1; i < fields.Length; i++)
            {
                entries.Add(TempParseNameCount(fields[i]));
            }
            entries.Sort(comparer);

            foreach (Pair<string, int> entry in entries)
            {
                writer.Write("\t" + entry.first + ":" + entry.second);
            }
            // NOTE(review): records end with a bare carriage return, not "\r\n"; kept as is.
            writer.Write("\r");
            entries.Clear();
        }
    }
    finally
    {
        reader.Close();
        if (writer != null)
        {
            writer.Close();
        }
    }
}

/// <summary>
/// Reads the first three lines of tmp.txt as "name:count" lists. The entries
/// of the first line are sorted with Pair.GetBySecondReverseComparer() and
/// written to tmp2.txt as "\tname:(count|count2|count3)", where count2 and
/// count3 are the counts of the same name on the second and third line
/// (0 when the name is absent there). A "\r" is written after every fifth item.
/// </summary>
/// <exception cref="InvalidOperationException">
/// tmp.txt contains fewer than three lines.
/// </exception>
private static void TempWriteCombinedCounts()
{
    FileReader reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp.txt");
    FileWriter writer = null;
    try
    {
        writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp2.txt", FileMode.Create);

        List<Pair<string, int>> primary = TempReadCountLine(reader, 1);
        // presumably line 2 holds location counts and line 3 organization counts
        // (the original locals were named locNum / orgNum) - confirm.
        Dictionary<string, int> locCounts = TempToLookup(TempReadCountLine(reader, 2));
        Dictionary<string, int> orgCounts = TempToLookup(TempReadCountLine(reader, 3));

        primary.Sort(new Pair<string, int>().GetBySecondReverseComparer());

        int written = 0;
        foreach (Pair<string, int> item in primary)
        {
            written++;
            int locNum;
            int orgNum;
            // TryGetValue leaves the out value at 0 when the name is missing.
            locCounts.TryGetValue(item.first, out locNum);
            orgCounts.TryGetValue(item.first, out orgNum);

            writer.Write("\t" + item.first + ":(" + item.second + "|" + locNum + "|" + orgNum + ")");
            if (written % 5 == 0)
            {
                // NOTE(review): bare carriage return as row separator; kept as is.
                writer.Write("\r");
            }
        }
    }
    finally
    {
        reader.Close();
        if (writer != null)
        {
            writer.Close();
        }
    }
}

/// <summary>
/// Reads one line from <paramref name="reader"/> and parses every
/// tab-separated field after the first as a "name:count" pair.
/// </summary>
/// <param name="reader">Reader positioned at the line to parse.</param>
/// <param name="lineNumber">1-based line number, used only in the error message.</param>
/// <returns>The parsed pairs in file order.</returns>
/// <exception cref="InvalidOperationException">The reader has no more lines.</exception>
private static List<Pair<string, int>> TempReadCountLine(FileReader reader, int lineNumber)
{
    string line = reader.ReadLine();
    if (line == null)
    {
        throw new InvalidOperationException("Expected at least 3 lines of counts, but line " + lineNumber + " is missing.");
    }

    string[] fields = line.Split('\t');
    List<Pair<string, int>> entries = new List<Pair<string, int>>();
    for (int j = 1; j < fields.Length; j++)
    {
        entries.Add(TempParseNameCount(fields[j]));
    }
    return entries;
}

/// <summary>
/// Builds a name-to-count dictionary from parsed pairs; when a name occurs
/// more than once the last count wins.
/// </summary>
private static Dictionary<string, int> TempToLookup(List<Pair<string, int>> entries)
{
    Dictionary<string, int> lookup = new Dictionary<string, int>();
    foreach (Pair<string, int> entry in entries)
    {
        lookup[entry.first] = entry.second;
    }
    return lookup;
}

/// <summary>
/// Splits a "name:count" field at its LAST colon, so names may themselves
/// contain colons, and parses the count as an integer.
/// </summary>
/// <param name="field">Text of the form name:count.</param>
/// <returns>A pair whose first is the name and whose second is the count.</returns>
private static Pair<string, int> TempParseNameCount(string field)
{
    // The char overload is an ordinal search and the index is computed only once.
    int colon = field.LastIndexOf(':');
    Pair<string, int> pair = new Pair<string, int>();
    pair.first = field.Substring(0, colon);
    pair.second = int.Parse(field.Substring(colon + 1));
    return pair;
}