/// <summary>
/// Converts a binary word2vec file to text: one record per word, the word
/// followed by its space-separated vector components, terminated by "\r".
/// Any "en/" namespace prefix (and everything before it) is stripped.
/// </summary>
/// <param name="word2vecFile">Binary word2vec input file.</param>
/// <param name="parsedFile">Text output file (overwritten).</param>
static void ParseWord2Vec(string word2vecFile, string parsedFile) {
    var writer = new LargeFileWriter(parsedFile, FileMode.Create);
    try {
        var parser = new ParseBinaryVector(word2vecFile);
        int count = 0;
        while (!parser.EOF) {
            if (++count % 1000 == 0) {
                Console.WriteLine(count); // progress heartbeat
            }
            try {
                var pair = parser.GetNextVector();
                var index = pair.first.IndexOf("en/", StringComparison.Ordinal);
                // BUG FIX: IndexOf returns -1 when "en/" is absent; the old code
                // then called Substring(-1 + 3), silently chopping the first two
                // characters of the word. Only strip when the prefix is present.
                var word = index >= 0 ? pair.first.Substring(index + 3) : pair.first;
                writer.Write(word);
                foreach (var value in pair.second) {
                    writer.Write(string.Format(" {0}", value));
                }
                writer.Write("\r");
            } catch (Exception) {
                // best-effort: skip malformed records rather than aborting the run
                continue;
            }
        }
    } finally {
        writer.Close(); // ensure output is flushed even on unexpected failure
    }
}
/// <summary>
/// Writes one TF-IDF vector per document to vectorPath in sparse
/// "label\tterm:weight..." format, one "\r"-terminated record per document.
/// Labels are 1-based and follow document order.
/// </summary>
private void OutputTfIdf() {
    var output = new LargeFileWriter(vectorPath, FileMode.Create);
    var processed = 0;
    for (ReadOneDoc(); this.doc != null; ReadOneDoc()) {
        var worker = TokenizerPool.GetTokenizer();
        var tokens = worker.Tokenize(doc);
        TokenizerPool.ReturnTokenizer(worker);
        processed++;
        if (processed % 1000 == 0) {
            Console.WriteLine(processed); // progress heartbeat
        }
        var sparseVector = GetTfIdf(tokens);
        // doc label equals the running count (both start at 1 and move in lockstep)
        output.Write(processed);
        foreach (var entry in sparseVector) {
            output.Write("\t" + entry.first + ":" + entry.second);
        }
        output.Write("\r");
    }
    output.Close();
}
/// <summary>
/// Persists each k-means cluster centroid to centroidInfoFile as one
/// "\r"-terminated line of tab-terminated component values.
/// </summary>
private void SaveCentroids() {
    var output = new LargeFileWriter(centroidInfoFile, FileMode.Create);
    foreach (var center in kmeans.Centroids) {
        foreach (var component in center) {
            output.Write(string.Format("{0}\t", component));
        }
        output.Write("\r"); // end of this centroid's record
    }
    output.Close();
}
/// <summary>
/// Extract features for bayes model. The input is split into chunks of
/// 10000 lines, each processed by its own worker thread into a temp file;
/// the temp files are then concatenated in order into the destination.
/// </summary>
/// <param name="source">
/// File path storing the data from which this program extract features.
/// </param>
/// <param name="des">
/// File path to store the extracted features.
/// </param>
private static void ExtractBayesFeature(string source, string des) {
    FileReader reader = new LargeFileReader(source);
    var lines = reader.ReadAllLines().ToList();
    reader.Close(); // BUG FIX: the reader was never closed
    FileWriter writer = new LargeFileWriter(des, FileMode.Create);
    try {
        const int numPerThread = 10000;
        var threadNum = (int)Math.Ceiling(1.0 * lines.Count / numPerThread);
        var childThreads = new Thread[threadNum];
        var tmpFiles = new string[threadNum];
        for (var i = 0; i < threadNum; i++) {
            tmpFiles[i] = "./tmp" + i + ".txt";
            // last chunk may be shorter than numPerThread
            var threadClass = new BayesFeatureThread(
                lines.GetRange(numPerThread * i, Math.Min(numPerThread, lines.Count - numPerThread * i)),
                tmpFiles[i]);
            childThreads[i] = new Thread(threadClass.ThreadMain);
            childThreads[i].Name = "thread " + i;
            childThreads[i].Start();
        }
        for (var i = 0; i < threadNum; i++) {
            childThreads[i].Join();
        }
        // concatenate per-thread outputs in chunk order, then clean up
        foreach (var tmpFile in tmpFiles) {
            var text = File.ReadAllText(tmpFile);
            writer.Write(text);
            File.Delete(tmpFile);
        }
    } finally {
        writer.Close(); // BUG FIX: the writer was never closed
    }
}
/// <summary>
/// Refines the disambiguations file downloaded from dbpedia: groups every
/// disambiguation target under its source title (with any "_(...)" qualifier
/// removed) and writes one tab-separated "source\ttarget..." line per title.
/// </summary>
/// <param name="sourceFile">dbpedia disambiguations dump to read.</param>
/// <param name="desFile">Destination file for the grouped titles.</param>
public static void RefineAmbiguousItem(string sourceFile, string desFile) {
    var reader = new LargeFileReader(sourceFile);
    var writer = new LargeFileWriter(desFile, System.IO.FileMode.Create);
    // "<.../Title>" --> captures "Title"
    var titlePattern = new System.Text.RegularExpressions.Regex(@"/([^/>]+)>");
    // strips a "_(qualifier)" / "(qualifier)" suffix from a title
    var qualifierPattern = new System.Text.RegularExpressions.Regex(@"_?\([^\)]+\)");
    // captures the title of the last "<...>" before the triple's closing " ."
    var targetPattern = new System.Text.RegularExpressions.Regex(@"/([^/>]+)>\s\.$");
    var targetsBySource = new Dictionary <string, List <string> >(300000);
    string row;
    reader.ReadLine(); // first line is a header; skip it
    while ((row = reader.ReadLine()) != null) {
        var title = titlePattern.Match(row).Groups[1].Value;
        title = qualifierPattern.Replace(title, "");
        var target = targetPattern.Match(row).Groups[1].Value;
        List <string> bucket;
        if (!targetsBySource.TryGetValue(title, out bucket)) {
            bucket = new List <string>();
            targetsBySource[title] = bucket;
        }
        bucket.Add(target);
    }
    reader.Close();
    foreach (var entry in targetsBySource) {
        writer.Write(entry.Key);
        foreach (var target in entry.Value) {
            writer.Write("\t" + target);
        }
        writer.WriteLine("");
    }
    writer.Close();
}
/// <summary>
/// Copies from a binary word2vec file only the vectors whose word appears in
/// interestWordFile (one word per line), writing each kept vector as a text
/// record ("word comp1 comp2 ...") terminated by "\r".
/// </summary>
public static void SelectInterestWordVector(string interestWordFile, string word2vecFile, string compressedWord2VectorFile) {
    // Load the interest-word list into a set for O(1) membership tests.
    var wordReader = new LargeFileReader(interestWordFile);
    var interestWords = new HashSet <string>();
    for (string row = wordReader.ReadLine(); row != null; row = wordReader.ReadLine()) {
        interestWords.Add(row.Trim());
    }
    wordReader.Close();
    var output = new LargeFileWriter(compressedWord2VectorFile, FileMode.Create);
    var vectorSource = new ParseBinaryVector(word2vecFile);
    var seen = 0;
    while (!vectorSource.EOF) {
        seen++;
        if (seen % 1000 == 0) {
            Console.WriteLine(seen); // progress heartbeat
        }
        try {
            var entry = vectorSource.GetNextVector();
            if (!interestWords.Contains(entry.first)) {
                continue;
            }
            output.Write(entry.first);
            foreach (var component in entry.second) {
                output.Write(string.Format(" {0}", component));
            }
            output.Write("\r");
        } catch (Exception) {
            // best-effort: skip records that fail to parse
            continue;
        }
    }
    output.Close();
}
/// <summary>
/// Scratch method holding several one-off file-munging jobs; each job is
/// wrapped in an if (false)/if (true) guard so only the last one runs.
/// NOTE(review): all paths are hard-coded to one developer machine.
/// </summary>
public static void Temp() {
    if (false) {
        // Job 1 (disabled): copy train.txt to "limited train.txt", keeping at
        // most ~100000 items per type label (column 1).
        var reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\train.txt");
        var writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\limited train.txt", FileMode.OpenOrCreate);
        Dictionary <string, int> numByType = new Dictionary <string, int>(16); // items kept so far per type
        String line;
        String[] array;
        int count = 0;
        int num = 0;
        while ((line = reader.ReadLine()) != null) {
            count++;
            if (count % 1000 == 0) {
                Console.Error.WriteLine(count + " items processed!");
            }
            array = line.Split('\t');
            // missing key --> first item of this type
            try {
                num = numByType[array[1]];
            } catch (Exception) {
                num = 0;
            }
            if (num > 100000) // do not limit train data number by type
            {
                continue;
            }
            writer.WriteLine(line);
            numByType[array[1]] = ++num;
        }
        reader.Close();
        writer.Close();
    }
    if (false) {
        // Job 2 (disabled): merge instance results with the develop source --
        // column 0 comes from the source line, the rest from the result line.
        string result = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\result\satori_lc\-1.inst.txt";
        string source = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\develop.txt";
        String tmpFile = "./tmp.txt";
        FileReader reader1 = new LargeFileReader(result);
        FileReader reader2 = new LargeFileReader(source);
        FileWriter writer = new LargeFileWriter(tmpFile, FileMode.OpenOrCreate);
        String line;
        String line2;
        writer.WriteLine(reader1.ReadLine()); // copy the result header verbatim
        while ((line = reader1.ReadLine()) != null) {
            line2 = reader2.ReadLine();
            writer.WriteLine(line2.Split('\t')[0] + "\t" + line.Split(new char[] { '\t' }, 2)[1]);
        }
        reader1.Close();
        reader2.Close();
        writer.Close();
        File.Copy(tmpFile, @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\result\satori_lc\.inst.txt");
        File.Delete(tmpFile);
    }
    if (false) {
        // Job 3 (disabled): rebuild the word table with stemmed words.
        // NOTE(review): the stemming body is commented out, so wordSet stays
        // empty and the table would be rewritten as an empty file -- confirm
        // before ever re-enabling this branch.
        string wordTableFile = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\word table\wordTable.txt";
        FileReader reader = new LargeFileReader(wordTableFile);
        FileWriter writer = new LargeFileWriter();
        HashSet <string> wordSet = new HashSet <string>();
        string line;
        while ((line = reader.ReadLine()) != null) {
            //var stemmer = StemmerPool.GetStemmer();
            //wordSet.Add(stemmer.Stem(line.Split('\t')[0])[0]);
            //StemmerPool.ReturnStemmer(stemmer);
            //stemmer = null;
        }
        reader.Close();
        writer.Open(wordTableFile);
        int i = 0;
        foreach (String word in wordSet) {
            writer.WriteLine(word + '\t' + (i++));
        }
        writer.Close();
    }
    if (false) {
        // Job 4 (disabled): concatenate the first comma-separated field of
        // every name-dictionary file into a single name-all.txt.
        String dir = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\names";
        string des = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\name-all.txt";
        FileReader reader = new LargeFileReader();
        FileWriter writer = new LargeFileWriter(des, FileMode.Create);
        string[] files = Directory.GetFiles(dir, "*.txt");
        string line;
        foreach (String file in files) {
            reader.Open(file);
            while ((line = reader.ReadLine()) != null) {
                writer.WriteLine(line.Split(',')[0]);
            }
        }
        reader.Close();
        writer.Close();
    }
    if (false) {
        // Job 5 (disabled): keep only names that never occur in the train file
        // with a non-person label.
        string path1 = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\name-all.txt";
        string path2 = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\limited train.txt";
        string des = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\tmp.txt";
        FileReader reader = new LargeFileReader(path1);
        FileWriter writer = new LargeFileWriter(des);
        String line;
        HashSet <String> set = new HashSet <string>();
        String[] array;
        while ((line = reader.ReadLine()) != null) {
            set.Add(line);
            array = line.Split(' '); // NOTE(review): split result is unused here
        }
        reader.Close();
        reader.Open(path2);
        while ((line = reader.ReadLine()) != null) {
            array = line.Split('\t');
            if (set.Contains(array[0].ToLower())) {
                if (!array[1].Equals("people.person")) {
                    set.Remove(array[0].ToLower());
                }
            }
        }
        reader.Close();
        foreach (String name in set) {
            writer.WriteLine(name);
        }
        writer.Close();
    }
    if (false) {
        // Job 6 (disabled): filter develop.txt down to the three types of
        // interest (person / location / organization).
        FileReader reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\backup\version 1-2\develop.txt");
        FileWriter writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\develop.txt", FileMode.OpenOrCreate);
        String line;
        string[] array;
        HashSet <string> interestTypes = new HashSet <string>();
        interestTypes.Add("people.person");
        interestTypes.Add("location.location");
        interestTypes.Add("organization.organization");
        while ((line = reader.ReadLine()) != null) {
            array = line.Split('\t');
            if (interestTypes.Contains(array[1])) {
                writer.WriteLine(line);
            }
        }
        reader.Close();
        writer.Close();
    }
    if (false) {
        // Job 7 (disabled): re-sort each line's "term:count" fields in
        // descending count order.
        FileReader reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp.txt");
        FileWriter writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp2.txt", FileMode.Create);
        String line;
        string[] array;
        string[] pairString;
        List <Pair <string, int> > list = new List <Pair <string, int> >();
        Pair <string, int> pair = new Pair <string, int>();
        Comparer <Pair <string, int> > comparer = pair.GetBySecondReverseComparer();
        while ((line = reader.ReadLine()) != null) {
            array = line.Split('\t');
            for (int i = 1; i < array.Length; i++) {
                // split "term:count" at the LAST ':' so terms may contain ':'
                pairString = new string[] { array[i].Substring(0, array[i].LastIndexOf(":")), array[i].Substring(array[i].LastIndexOf(":") + 1) };
                pair = new Pair <string, int>();
                pair.first = pairString[0];
                pair.second = int.Parse(pairString[1]);
                list.Add(pair);
            }
            list.Sort(comparer);
            foreach (Pair <string, int> item in list) {
                writer.Write("\t" + item.first + ":" + item.second);
            }
            writer.Write("\r");
            list.Clear();
        }
        reader.Close();
        writer.Close();
    }
    if (true) {
        // Job 8 (ACTIVE): read three "term:count" lines (presumably
        // person/location/organization -- TODO confirm), sort the first line's
        // terms by count descending, then write "term:(n0|n1|n2)" entries,
        // five per "\r"-terminated output row.
        FileReader reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp.txt");
        FileWriter writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp2.txt", FileMode.Create);
        String line;
        string[] lines = new string[3]; // NOTE(review): unused
        string[] array;
        string[] pairString;
        Dictionary <string, int>[] dics = new Dictionary <string, int> [3]; // term-->count per input line
        List <Pair <string, int> > list = new List <Pair <string, int> >();
        Pair <string, int> pair = new Pair <string, int>();
        Comparer <Pair <string, int> > comparer = pair.GetBySecondReverseComparer();
        for (int i = 0; i < 3; i++) {
            line = reader.ReadLine();
            array = line.Split('\t');
            dics[i] = new Dictionary <string, int>();
            if (i == 0) {
                // the first line also drives the output ordering via `list`
                for (int j = 1; j < array.Length; j++) {
                    pairString = new string[] { array[j].Substring(0, array[j].LastIndexOf(":")), array[j].Substring(array[j].LastIndexOf(":") + 1) };
                    pair = new Pair <string, int>();
                    pair.first = pairString[0];
                    pair.second = int.Parse(pairString[1]);
                    dics[i][pair.first] = pair.second;
                    list.Add(pair);
                }
            } else {
                for (int j = 1; j < array.Length; j++) {
                    pairString = new string[] { array[j].Substring(0, array[j].LastIndexOf(":")), array[j].Substring(array[j].LastIndexOf(":") + 1) };
                    dics[i][pairString[0]] = int.Parse(pairString[1]);
                }
            }
        }
        list.Sort(comparer);
        int count = 10; // starts at 10 so the first row break falls after 5 items
        int locNum;
        int orgNum;
        foreach (Pair <string, int> item in list) {
            count++;
            // missing term in a dictionary --> count 0
            try {
                locNum = dics[1][item.first];
            } catch (Exception) {
                locNum = 0;
            }
            try {
                orgNum = dics[2][item.first];
            }catch (Exception) {
                orgNum = 0;
            }
            writer.Write("\t" + item.first + ":(" + item.second + "|" + locNum + "|" + orgNum + ")");
            if (count % 5 == 0) {
                writer.Write("\r");
            }
        }
        reader.Close();
        writer.Close();
    }
}
/// <summary>
/// Builds a confusion matrix from a result file (tab-separated columns; the
/// true label is column 1 and the predicted label column 2; the first line is
/// a header) and writes the matrix plus macro/micro precision-recall figures
/// to evaluationFile.
/// </summary>
/// <param name="resultFile">Prediction results to evaluate.</param>
/// <param name="evaluationFile">Destination for the evaluation report.</param>
public void EvaluateResult(string resultFile, string evaluationFile) {
    var reader = new LargeFileReader(resultFile);
    var line = "";
    var result = new Dictionary <string, Dictionary <string, int> >(); // class-->(predicted class --> number)
    int times = 0;
    var trueLabelIndex = 1;
    var predictLabelIndex = 2;
    var writer = new LargeFileWriter(evaluationFile, FileMode.Create);
    Dictionary <string, int> dic = null;
    line = reader.ReadLine(); // skip the header line
    while ((line = reader.ReadLine()) != null) {
        var array = line.Split('\t');
        var trueLabel = array[trueLabelIndex];
        var predictedLabel = array[predictLabelIndex];
        // CONSISTENCY FIX: the old catch branch used a hard-coded array[2]
        // instead of predictLabelIndex; TryGetValue also replaces the
        // exception-driven control flow (same resulting counts).
        if (!result.TryGetValue(trueLabel, out dic)) {
            dic = new Dictionary <string, int>();
            result[trueLabel] = dic;
        }
        dic.TryGetValue(predictedLabel, out times); // times defaults to 0 when absent
        dic[predictedLabel] = times + 1;
    }
    reader.Close();
    // Header row of the confusion matrix.
    writer.Write("True|Predict");
    var keys = result.Keys;
    foreach (var key in keys) {
        writer.Write("\t" + key);
    }
    writer.WriteLine("");
    // One row per true label; cells count predictions for each label.
    foreach (var key in keys) {
        writer.Write(key);
        var info = result[key];
        foreach (var k in keys) {
            if (info.TryGetValue(k, out times)) {
                writer.Write("\t" + times);
            } else {
                writer.Write("\t" + 0);
            }
        }
        writer.WriteLine("");
    }
    var macroPre = Util.GetMacroPrecision(result);
    var macroRec = Util.GetMacroRecall(result);
    var macroF1 = Util.GetF1(macroPre, macroRec);
    writer.WriteLine("macro-precision: " + macroPre);
    writer.WriteLine("macro-recall : " + macroRec);
    writer.WriteLine("macro-F1 : " + macroF1);
    var microPre = Util.GetMicroPrecision(result);
    writer.WriteLine("micro-precision: " + microPre);
    writer.Close();
}
/// <summary>
/// Runs the bayes model over sourceFile, writing per-item prediction scores
/// to resultFile, followed by a confusion matrix with per-label recall and
/// precision columns (fixed-width, 30 chars).
/// </summary>
public void Test() {
    if (model == null) {
        Initial(); // lazily initialize the model on first use
    }
    var fields = BayesModel.GetFields(sourceFile);
    FileReader reader = new LargeFileReader(sourceFile);
    FileWriter writer = new LargeFileWriter(resultFile, FileMode.Create);
    // actual label-->(prediced label-->times)
    var detailDic = new Dictionary <string, Dictionary <string, int> >();
    var positiveNums = new Dictionary <string, int>(); // positive number by type
    var predictedNums = new Dictionary <string, int>(); // predicted number by type
    var actualNums = new Dictionary <string, int>(); // actual number by type
    Dictionary <string, int> dic = null;
    Pair <string, Dictionary <string, object> > feature = null;
    var i = 0;
    while ((feature = BayesModel.GetFeatureItem(reader, fields)) != null) {
        i++;
        var label = feature.first;
        string predictedLabel = null;
        try {
            predictedLabel = Predict(feature.second);
        } catch (Exception) {
            // prediction failed: record a NULL row and move on
            Console.WriteLine("Wrong!");
            writer.WriteLine(i + "\t" + label + "\tNULL");
            continue;
        }
        // one result row: item index, true label, then per-label scores
        writer.Write(string.Format("{0}\t{1, -30}", i, label));
        foreach (var score in this.scores) {
            writer.Write(string.Format("{0,30}:{1,-10:F2}", score.first, score.second));
        }
        writer.Write("\r");
        // The catch blocks below implement "missing key --> start at 1".
        if (label.Equals(predictedLabel)) {
            try {
                positiveNums[label] += 1;
            } catch (Exception) {
                positiveNums[label] = 1;
            }
        }
        try { // update predicted number
            predictedNums[predictedLabel] += 1;
        } catch (Exception) {
            predictedNums[predictedLabel] = 1;
        }
        try { // update actually number
            actualNums[label] += 1;
        } catch (Exception) {
            actualNums[label] = 1;
        }
        // update detail dictionary
        try {
            dic = detailDic[label];
        } catch (Exception) {
            dic = new Dictionary <string, int>();
            detailDic[label] = dic;
        }
        try {
            dic[predictedLabel] += 1;
        } catch (Exception) {
            dic[predictedLabel] = 1;
        }
    }
    // Render the confusion matrix: header row, one row per actual label with a
    // trailing recall column, then a final precision row.
    var buffer = new StringBuilder();
    buffer.Append(string.Format("{0,-30}", "actual label |predicted type"));
    foreach (var key in this.labels) {
        buffer.Append(string.Format("{0,-30}", key));
    }
    buffer.Append(string.Format("{0,-30}\r", "recall"));
    foreach (var key in this.labels) {
        buffer.Append(string.Format("{0,-30}", key));
        dic = detailDic[key];
        // NOTE(review): dic[k] throws if a (actual, predicted) pair never
        // occurred; assumes every label combination was seen -- confirm.
        foreach (var k in this.labels) {
            buffer.Append(string.Format("{0,-30}", dic[k]));
        }
        // recall
        buffer.Append(string.Format("{0,-30}\r", 1.0 * positiveNums[key] / actualNums[key]));
    }
    buffer.Append(string.Format("{0,-30}", "precision"));
    foreach (var key in this.labels) {
        buffer.Append(string.Format("{0,-30:f5}", 1.0 * positiveNums[key] / predictedNums[key]));
    }
    buffer.Append("\r");
    writer.WriteLine(buffer.ToString());
    writer.Close();
}