/// <summary>
/// Reads tab-separated pattern statistics from <paramref name="inputFile"/>, accumulates them into the
/// <c>patternFrequency</c> and <c>patternMaxfrequency</c> tables, and writes every pattern with its two
/// accumulated counts and their ratio to <paramref name="outputFile"/>.
/// </summary>
/// <param name="inputFile">Path of the input file. Each line is read as <c>pattern \t f1 \t f2</c>, where f1 and f2 are parsed as integers.</param>
/// <param name="outputFile">Path of the file to write. Each line is <c>pattern \t frequency \t maxFrequency \t ratio</c>, ordered by <c>DinoComparerStringDouble</c>.</param>
public void Formalize(string inputFile, string outputFile)
{
    // Both streams live in using blocks so they are released even if a line fails to parse.
    using (StreamReader reader = new StreamReader(inputFile))
    using (StreamWriter writer = new StreamWriter(outputFile))
    {
        string currentLine;
        while ((currentLine = reader.ReadLine()) != null)
        {
            string[] tokens = currentLine.Split('\t');
            int f1 = Convert.ToInt32(tokens[1]);
            int f2 = Convert.ToInt32(tokens[2]);

            // Normalise the pattern text before using it as the table key.
            string pattern = tokens[0].Replace("what s ", "what be ");

            if (!patternFrequency.ContainsKey(pattern))
            {
                patternFrequency.Add(pattern, 0);
            }
            patternFrequency[pattern] += f1;

            // Note: despite the table's name, f2 is summed here, not maximised.
            if (!patternMaxfrequency.ContainsKey(pattern))
            {
                patternMaxfrequency.Add(pattern, 0);
            }
            patternMaxfrequency[pattern] += f2;
        }

        // Key = the output columns already joined by tabs, Value = frequency / maxFrequency.
        List<KeyValuePair<string, double>> patternFrequencyList = new List<KeyValuePair<string, double>>();
        foreach (string pattern in patternFrequency.Keys)
        {
            if (!patternMaxfrequency.ContainsKey(pattern))
            {
                continue;
            }

            string columns = pattern + "\t" + patternFrequency[pattern] + "\t" + patternMaxfrequency[pattern];
            double ratio = Convert.ToDouble(patternFrequency[pattern]) / patternMaxfrequency[pattern];
            patternFrequencyList.Add(new KeyValuePair<string, double>(columns, ratio));
        }

        DinoComparerStringDouble dc = new DinoComparerStringDouble();
        patternFrequencyList.Sort(dc);

        foreach (var item in patternFrequencyList)
        {
            writer.WriteLine(item.Key + "\t" + item.Value);
        }
    }
}
/// <summary>
/// Copies the first <paramref name="n"/> lines of <paramref name="inputFile"/>, after ordering them
/// with <c>DinoComparerStringDouble</c> on their last tab-separated field, to <paramref name="outputFile"/>.
/// </summary>
/// <remarks>
/// Lines containing '~' and lines whose first two tab-separated fields are equal are ignored.
/// If fewer than <paramref name="n"/> lines remain, all of them are written.
/// </remarks>
/// <param name="inputFile">Path of the tab-separated input file; the last field of each line is parsed as a double score.</param>
/// <param name="outputFile">Path of the file receiving the selected lines, unchanged.</param>
/// <param name="n">Maximum number of lines to write.</param>
public void SampleTopN(string inputFile, string outputFile, int n)
{
    using (StreamReader reader = new StreamReader(inputFile))
    using (StreamWriter writer = new StreamWriter(outputFile))
    {
        List<KeyValuePair<string, double>> scoredLines = new List<KeyValuePair<string, double>>();

        string currentLine;
        while ((currentLine = reader.ReadLine()) != null)
        {
            if (currentLine.Contains('~'))
            {
                continue;
            }

            string[] tokens = currentLine.Split('\t');
            if (tokens[0] == tokens[1])
            {
                continue;
            }

            // The score is the last column; reuse the split instead of splitting the line again.
            double score = Convert.ToDouble(tokens[tokens.Length - 1]);
            scoredLines.Add(new KeyValuePair<string, double>(currentLine, score));
        }

        DinoComparerStringDouble dc = new DinoComparerStringDouble();
        scoredLines.Sort(dc);

        // Clamp to the number of available lines so a large n cannot index past the end of the list.
        int limit = Math.Min(n, scoredLines.Count);
        for (int i = 0; i < limit; i++)
        {
            writer.WriteLine(scoredLines[i].Key);
        }
    }
}
/// <summary>
/// Groups the lines of <paramref name="inputFile"/> that contain "people." by their third tab-separated
/// field (the predicate), then writes the first one or two lines, in <c>DinoComparerStringDouble</c>
/// order, of randomly chosen predicates to <paramref name="outputFile"/>.
/// </summary>
/// <remarks>
/// Before writing, the method prints how many kept lines contain the marriage/event/name path and
/// then waits for a key press on the console.
/// </remarks>
/// <param name="inputFile">Path of the tab-separated input file; the fourth field of each kept line is parsed as an integer frequency.</param>
/// <param name="outputFile">Path of the file receiving the sampled lines.</param>
public void Process(string inputFile, string outputFile)
{
    const int maxPredicates = 140;   // upper bound on the number of predicates sampled
    const int maxAttempts = 10;      // draws allowed while looking for a not-yet-used predicate

    Dictionary<string, List<KeyValuePair<string, double>>> pToTf = new Dictionary<string, List<KeyValuePair<string, double>>>();
    List<string> predicateList = new List<string>();
    int temp = 0;

    using (StreamReader reader = new StreamReader(inputFile))
    {
        string currentLine;
        while ((currentLine = reader.ReadLine()) != null)
        {
            // Filter first: lines that are discarded anyway do not need to be well-formed.
            if (!currentLine.Contains("people."))
            {
                continue;
            }

            string[] tokens = currentLine.Split('\t');
            string predicate = tokens[2];
            double frequency = Convert.ToInt32(tokens[3]);

            if (currentLine.Contains("-mso/people.person.marriage-mso/time.event.person-mso/type.object.name"))
            {
                temp++;
            }

            if (!pToTf.ContainsKey(predicate))
            {
                pToTf.Add(predicate, new List<KeyValuePair<string, double>>());
                predicateList.Add(predicate);
            }
            pToTf[predicate].Add(new KeyValuePair<string, double>(currentLine, frequency));
        }
    }

    Console.WriteLine(temp);
    Console.ReadKey();   // blocks until a key is pressed

    HashSet<string> used = new HashSet<string>();
    Random r = new Random();

    using (StreamWriter writer = new StreamWriter(outputFile))
    {
        // With no predicates there is nothing to draw from; indexing the empty list would throw.
        for (int i = 0; i < maxPredicates && predicateList.Count > 0; i++)
        {
            string predicate = "";
            int count = 0;
            while (true)
            {
                count++;
                predicate = predicateList[r.Next(predicateList.Count)];
                if (!used.Contains(predicate))
                {
                    break;
                }
                if (count > maxAttempts)
                {
                    break;
                }
            }

            // Sampling stops for good once the attempt budget has been exceeded.
            if (count > maxAttempts)
            {
                break;
            }

            used.Add(predicate);

            DinoComparerStringDouble dc = new DinoComparerStringDouble();
            pToTf[predicate].Sort(dc);

            writer.WriteLine(pToTf[predicate][0].Key);
            if (pToTf[predicate].Count > 1)
            {
                writer.WriteLine(pToTf[predicate][1].Key);
            }
        }
    }
}
/// <summary>
/// Flattens the <c>categoryPathScore</c> table into (category, path, score) rows, orders them with
/// <c>DinoComparerStringDouble</c>, and writes them to <paramref name="outputFile"/>.
/// </summary>
/// <param name="outputFile">Path of the file to write; each line is <c>category \t path \t score</c>.</param>
public void OutputCategoryPathScore(string outputFile)
{
    StreamWriter output = new StreamWriter(outputFile);

    // Key = "category<TAB>path", Value = the score stored for that pair.
    List<KeyValuePair<string, double>> rows = new List<KeyValuePair<string, double>>();
    foreach (string category in categoryPathScore.Keys)
    {
        var scoresByPath = categoryPathScore[category];
        foreach (string path in scoresByPath.Keys)
        {
            string label = category + "\t" + path;
            rows.Add(new KeyValuePair<string, double>(label, scoresByPath[path]));
        }
    }

    rows.Sort(new DinoComparerStringDouble());

    for (int i = 0; i < rows.Count; i++)
    {
        output.WriteLine(rows[i].Key + "\t" + rows[i].Value);
    }

    output.Close();
}
/// <summary>
/// Writes every pattern present in both <c>patternFrequency</c> and <c>patternMaxfrequency</c> to
/// <paramref name="outputFile"/>, together with both counts and their ratio, ordered by
/// <c>DinoComparerStringDouble</c>.
/// </summary>
/// <param name="outputFile">Path of the file to write; each line is <c>pattern \t frequency \t maxFrequency \t ratio</c>.</param>
public void OutputPatternFrequency(string outputFile)
{
    // Key = the first three output columns joined by tabs, Value = frequency / maxFrequency.
    List<KeyValuePair<string, double>> rows = new List<KeyValuePair<string, double>>();

    foreach (string pattern in patternFrequency.Keys)
    {
        if (patternMaxfrequency.ContainsKey(pattern))
        {
            string columns = pattern + "\t" + patternFrequency[pattern] + "\t" + patternMaxfrequency[pattern];
            double ratio = Convert.ToDouble(patternFrequency[pattern]) / patternMaxfrequency[pattern];
            rows.Add(new KeyValuePair<string, double>(columns, ratio));
        }
    }

    rows.Sort(new DinoComparerStringDouble());

    StreamWriter output = new StreamWriter(outputFile);
    for (int i = 0; i < rows.Count; i++)
    {
        output.WriteLine(rows[i].Key + "\t" + rows[i].Value);
    }
    output.Close();
}
/// <summary>
/// Reads labelled patterns from <paramref name="inputFile"/>, counts how often each space-separated
/// token occurs in the "good" and "bad" pattern lists, and writes a per-token score to
/// <paramref name="outputFile"/>.
/// </summary>
/// <remarks>
/// Each input line is lower-cased and split on tabs into <c>text \t path \t label</c>. Lines whose
/// second field contains "government" are skipped. Spaces in the second field become underscores,
/// the two fields are joined with a space, and the span from the first '&lt;' through the first '&gt;'
/// is replaced by a single space. A line with no '&lt;' makes <c>Substring</c> throw.
/// NOTE(review): lines labelled "true" are added to the bad list and all others to the good list —
/// confirm that this polarity is intended.
/// Only tokens that occur in at least one good pattern are written.
/// </remarks>
/// <param name="inputFile">Path of the tab-separated, labelled input file.</param>
/// <param name="outputFile">Path of the file to write; each line is <c>token \t score \t goodCount \t badCount</c>, ordered by <c>DinoComparerStringDouble</c>.</param>
public void Process_Bayes(string inputFile, string outputFile)
{
    // using blocks release both files even if a malformed line throws part-way through.
    using (StreamReader reader = new StreamReader(inputFile))
    using (StreamWriter writer = new StreamWriter(outputFile))
    {
        List<string> goodList = new List<string>();
        List<string> badList = new List<string>();
        Dictionary<string, int> tokenGoodvalue = new Dictionary<string, int>();
        Dictionary<string, int> tokenSumvalue = new Dictionary<string, int>();
        Dictionary<string, int> tokenBadvalue = new Dictionary<string, int>();

        string currentLine;
        while ((currentLine = reader.ReadLine()) != null)
        {
            currentLine = currentLine.ToLower();
            string[] tokens = currentLine.Split('\t');
            string label = tokens[2];
            if (tokens[1].Contains("government"))
            {
                continue;
            }

            tokens[1] = tokens[1].Replace(' ', '_');
            string pattern = tokens[0] + " " + tokens[1];

            // Cut out the first <...> placeholder and leave a space in its place.
            pattern = pattern.Substring(0, pattern.IndexOf('<')) + " " + pattern.Substring(pattern.IndexOf('>') + 1);

            if (label == "true")
            {
                badList.Add(pattern);
            }
            else
            {
                goodList.Add(pattern);
            }
        }

        foreach (string pattern in goodList)
        {
            foreach (string token in pattern.Split(' '))
            {
                if (token == "")
                {
                    continue;
                }

                // Every token seen in a good pattern also gets a bad-count entry, so the
                // scoring loop below can index tokenBadvalue without checking.
                if (!tokenGoodvalue.ContainsKey(token))
                {
                    tokenGoodvalue.Add(token, 0);
                    tokenBadvalue.Add(token, 0);
                }
                tokenGoodvalue[token]++;

                if (!tokenSumvalue.ContainsKey(token))
                {
                    tokenSumvalue.Add(token, 0);
                }
                tokenSumvalue[token]++;
            }
        }

        foreach (string pattern in badList)
        {
            foreach (string token in pattern.Split(' '))
            {
                if (token == "")
                {
                    continue;
                }

                if (!tokenBadvalue.ContainsKey(token))
                {
                    tokenBadvalue.Add(token, 0);
                }
                tokenBadvalue[token]++;

                if (!tokenSumvalue.ContainsKey(token))
                {
                    tokenSumvalue.Add(token, 0);
                }
                tokenSumvalue[token]++;
            }
        }

        // Score = (good / total) / ((bad + 1) / total); the +1 keeps the divisor non-zero.
        List<KeyValuePair<string, double>> temp = new List<KeyValuePair<string, double>>();
        foreach (string token in tokenGoodvalue.Keys)
        {
            double goodShare = Convert.ToDouble(tokenGoodvalue[token]) / tokenSumvalue[token];
            double badShare = Convert.ToDouble(tokenBadvalue[token] + 1) / tokenSumvalue[token];
            temp.Add(new KeyValuePair<string, double>(token, goodShare / badShare));
        }
        // Alternative scores that were tried and left disabled:
        //temp.Add(new KeyValuePair<string, double>(token, Math.Log(tokenGoodvalue[token])*Convert.ToDouble(tokenGoodvalue[token])/tokenSumvalue[token]));
        //temp.Add(new KeyValuePair<string, double>(token, Convert.ToDouble(tokenGoodvalue[token])*Math.Log(Convert.ToDouble(tokenSumvalue[token])/tokenGoodvalue[token])));

        DinoComparerStringDouble dc = new DinoComparerStringDouble();
        temp.Sort(dc);

        foreach (var item in temp)
        {
            writer.WriteLine(item.Key + "\t" + item.Value + "\t" + tokenGoodvalue[item.Key] + "\t" + tokenBadvalue[item.Key]);
        }
    }
}
/// <summary>
/// For every pattern in <c>patternCluepathvalueFrequency</c>, writes a header line followed by one
/// line per clue path value giving its two frequencies and their ratio, then a blank line.
/// </summary>
/// <remarks>
/// The ratio is the clue-path-value frequency divided by the matching entry of
/// <c>patternPathvalueFrequency</c>. The header shows the pattern and the <c>patternFrequency</c> entry
/// for the part of the pattern before its first tab. Patterns with no clue path values are skipped.
/// A filter that kept only values with <c>frequency * 2 &gt; patternHasanswernode[pattern]</c> was
/// disabled in the original code and is not applied here.
/// </remarks>
/// <param name="outputFile">Path of the file to write.</param>
public void OutputCorrelation(string outputFile)
{
    // The using block closes the file even when one of the dictionary lookups below throws.
    using (StreamWriter writer = new StreamWriter(outputFile))
    {
        foreach (string pattern in patternCluepathvalueFrequency.Keys)
        {
            // Key = "value<TAB>clueFrequency<TAB>pathFrequency", Value = clueFrequency / pathFrequency.
            List<KeyValuePair<string, double>> a = new List<KeyValuePair<string, double>>();
            foreach (string clupathvalue in patternCluepathvalueFrequency[pattern].Keys)
            {
                double ratio = Convert.ToDouble(patternCluepathvalueFrequency[pattern][clupathvalue])
                    / Convert.ToDouble(patternPathvalueFrequency[pattern][clupathvalue]);
                string columns = clupathvalue + "\t" + patternCluepathvalueFrequency[pattern][clupathvalue]
                    + "\t" + patternPathvalueFrequency[pattern][clupathvalue];
                a.Add(new KeyValuePair<string, double>(columns, ratio));
            }

            if (a.Count == 0)
            {
                continue;
            }

            DinoComparerStringDouble dc = new DinoComparerStringDouble();
            a.Sort(dc);

            writer.WriteLine(pattern + "\t" + patternFrequency[pattern.Split('\t')[0]]);
            foreach (var item in a)
            {
                writer.WriteLine(item.Key + "\t" + item.Value);
            }
            writer.WriteLine();
        }
    }
}
/// <summary>
/// Reads (category, path, frequency) rows from <paramref name="inputFile"/>, appends them to
/// <c>categorypathFrequency</c>, orders that list with <c>DinoComparerStringDouble</c>, and writes it
/// to <paramref name="outputFile"/>.
/// </summary>
/// <remarks>
/// The key of each entry is the first two tab-separated fields concatenated with no separator.
/// Entries whose key contains '~' are dropped. This method does not clear
/// <c>categorypathFrequency</c>, so anything already in the list is sorted and written as well.
/// </remarks>
/// <param name="inputFile">Path of the tab-separated input file; the third field is parsed as a double.</param>
/// <param name="outputFile">Path of the file to write; each line is <c>categorypath \t frequency</c>.</param>
public void Process(string inputFile, string outputFile)
{
    // using blocks make sure both files are released if a frequency fails to parse.
    using (StreamReader reader = new StreamReader(inputFile))
    using (StreamWriter writer = new StreamWriter(outputFile))
    {
        string currentLine;
        while ((currentLine = reader.ReadLine()) != null)
        {
            string[] tokens = currentLine.Split('\t');
            string categorypath = tokens[0] + tokens[1];
            double frequency = Convert.ToDouble(tokens[2]);
            if (categorypath.Contains('~'))
            {
                continue;
            }

            categorypathFrequency.Add(new KeyValuePair<string, double>(categorypath, frequency));
        }

        DinoComparerStringDouble dc = new DinoComparerStringDouble();
        categorypathFrequency.Sort(dc);

        foreach (var item in categorypathFrequency)
        {
            writer.WriteLine(item.Key + "\t" + item.Value);
        }
    }
}
/// <summary>
/// Reads scored pattern/path rows from <paramref name="inputFile"/>, orders them with
/// <c>DinoComparerStringDouble</c>, and writes the leading rows that do not contain '~' to
/// <paramref name="outputFile"/>.
/// </summary>
/// <remarks>
/// The key of each row is the second and fourth tab-separated fields joined by a tab; the score is
/// the fifth field. Writing stops when the 10,000th qualifying row is reached, so at most 9,999
/// rows are written.
/// NOTE(review): confirm whether the intended cap is 9,999 or 10,000 rows.
/// </remarks>
/// <param name="inputFile">Path of the tab-separated input file with at least five fields per line.</param>
/// <param name="outputFile">Path of the file to write; each line is <c>pattern \t path \t score</c>.</param>
public void GenerateTopPatternPath(string inputFile, string outputFile)
{
    // using blocks release both files even when a score fails to parse.
    using (StreamReader reader = new StreamReader(inputFile))
    using (StreamWriter writer = new StreamWriter(outputFile))
    {
        List<KeyValuePair<string, double>> patternpathToScore = new List<KeyValuePair<string, double>>();

        string currentLine;
        while ((currentLine = reader.ReadLine()) != null)
        {
            string[] tokens = currentLine.Split('\t');
            string patternpath = tokens[1] + "\t" + tokens[3];
            double score = Convert.ToDouble(tokens[4]);
            patternpathToScore.Add(new KeyValuePair<string, double>(patternpath, score));
        }

        DinoComparerStringDouble dc = new DinoComparerStringDouble();
        patternpathToScore.Sort(dc);

        int count = 0;
        foreach (var item in patternpathToScore)
        {
            if (item.Key.Contains('~'))
            {
                continue;
            }

            // The counter is checked before the write, which is why the 10,000th row is not emitted.
            count++;
            if (count == 10000)
            {
                break;
            }

            writer.WriteLine(item.Key + "\t" + item.Value);
        }
    }
}