/// <summary>
/// Builds a "question bigram -> most frequent category" table from a category-label file
/// and a parallel POS-tagged question file, and writes it to <paramref name="gramCategoryFile"/>.
/// </summary>
/// <remarks>
/// The n-th question read from <paramref name="postagFile"/> is labelled with the n-th line of
/// <paramref name="categoryFile"/>. A question may span several physical lines; lines are joined
/// with a single space until the accumulated text contains a '?'.
/// The output file is created only after both inputs have been read, so a failure while reading
/// does not leave a truncated output file behind.
/// </remarks>
/// <param name="categoryFile">Path of the label file; one label per line, first space-separated token is used.</param>
/// <param name="postagFile">Path of the tagged question file; tokens are space-separated and split on '_'.</param>
/// <param name="gramCategoryFile">Path of the output file; one "bigram&lt;TAB&gt;category" line per distinct bigram.</param>
/// <exception cref="KeyNotFoundException">
/// The question file contains more questions than the category file has lines.
/// </exception>
public void Process(string categoryFile, string postagFile,string gramCategoryFile)
{
    Dictionary<int, string> orderCategory = ReadOrderedCategories(categoryFile);
    Dictionary<string, Dictionary<string, int>> bigramCategoryF = CountBigramCategories(postagFile, orderCategory);

    using (StreamWriter writer = new StreamWriter(gramCategoryFile))
    {
        foreach (KeyValuePair<string, Dictionary<string, int>> bigramEntry in bigramCategoryF)
        {
            int maxF = 0;
            string maxCategory = "";

            // Strict '>' keeps the first category encountered when several share the top count.
            foreach (KeyValuePair<string, int> categoryCount in bigramEntry.Value)
            {
                if (categoryCount.Value > maxF)
                {
                    maxF = categoryCount.Value;
                    maxCategory = categoryCount.Key;
                }
            }

            writer.WriteLine(bigramEntry.Key + "\t" + maxCategory);
        }
    }
}

/// <summary>
/// Reads the category file and maps each zero-based line index to its category label.
/// The label is the first space-separated token of the line, reduced to the part before ':'
/// unless it is exactly "NUM:date", which is kept whole.
/// </summary>
private static Dictionary<int, string> ReadOrderedCategories(string categoryFile)
{
    Dictionary<int, string> orderCategory = new Dictionary<int, string>();

    using (StreamReader reader = new StreamReader(categoryFile))
    {
        int order = 0;
        string currentLine;
        while ((currentLine = reader.ReadLine()) != null)
        {
            string category = currentLine.Split(' ')[0];
            if (category != "NUM:date")
            {
                category = category.Split(':')[0];
            }

            orderCategory.Add(order, category);
            order++;
        }
    }

    return orderCategory;
}

/// <summary>
/// Reads the tagged questions and counts, for every formalized bigram, how often it occurs
/// with each category.
/// </summary>
private static Dictionary<string, Dictionary<string, int>> CountBigramCategories(string postagFile, Dictionary<int, string> orderCategory)
{
    Dictionary<string, Dictionary<string, int>> bigramCategoryF = new Dictionary<string, Dictionary<string, int>>();
    CFormalizeString stringFormalizor = new CFormalizeString();

    using (StreamReader reader = new StreamReader(postagFile))
    {
        int order = 0;
        string currentLine;
        while ((currentLine = reader.ReadLine()) != null)
        {
            // Join continuation lines until the question mark shows up.
            string fullLine = currentLine;
            bool reachedEndOfFile = false;
            while (fullLine.IndexOf('?') < 0)
            {
                currentLine = reader.ReadLine();
                if (currentLine == null)
                {
                    // Without this guard the loop would spin forever: appending null
                    // never adds a '?' and ReadLine keeps returning null at end of file.
                    reachedEndOfFile = true;
                    break;
                }

                fullLine = fullLine + " " + currentLine;
            }

            // Trailing blank lines are not a question and must not consume a category.
            if (reachedEndOfFile && fullLine.Trim().Length == 0)
            {
                break;
            }

            string[] tokens = fullLine.ToLower().Split(' ');

            string category;
            if (!orderCategory.TryGetValue(order, out category))
            {
                throw new KeyNotFoundException(
                    "No category line for question #" + order + " (zero-based) read from '" + postagFile + "'.");
            }
            order++;

            string bigram = BuildQuestionBigram(tokens);
            if (bigram == "")
            {
                continue;
            }

            bigram = stringFormalizor.FormalizeString(bigram);

            Dictionary<string, int> categoryCounts;
            if (!bigramCategoryF.TryGetValue(bigram, out categoryCounts))
            {
                categoryCounts = new Dictionary<string, int>();
                bigramCategoryF.Add(bigram, categoryCounts);
            }

            int seen;
            categoryCounts.TryGetValue(category, out seen); // seen stays 0 when the category is new
            categoryCounts[category] = seen + 1;
        }
    }

    return bigramCategoryF;
}

/// <summary>
/// Derives the bigram of a tokenized question, or "" when none can be formed.
/// Questions whose first token starts with "what" pair "what" with the word part of the first
/// token containing "_n" (presumably a noun POS tag - confirm against the tagger's tag set);
/// every other question uses the word parts of its first two tokens.
/// </summary>
private static string BuildQuestionBigram(string[] tokens)
{
    if (tokens[0].StartsWith("what"))
    {
        foreach (string token in tokens)
        {
            if (token.Contains("_n"))
            {
                return "what" + " " + token.Split('_')[0];
            }
        }

        return "";
    }

    // A single-token question has no second word; skip it instead of indexing past the array.
    if (tokens.Length < 2)
    {
        return "";
    }

    return tokens[0].Split('_')[0] + " " + tokens[1].Split('_')[0];
}
/// <summary>
/// Scans a tab-separated file and records, for every subject that passes <c>IsId</c>,
/// the lower-cased names/aliases that are present in <c>famousEntitySet</c>.
/// </summary>
/// <remarks>
/// A line is used only when its second field is <c>&lt;type.object.name&gt;</c> or
/// <c>&lt;common.topic.alias&gt;</c>. Each match appends the lower-cased third field followed by
/// a tab character to the list stored in <c>idPathList</c> under the first field.
/// Lines with fewer than three tab-separated fields are skipped.
/// A progress line is printed to the console every 1,000,000 input lines.
/// </remarks>
/// <param name="inputFile">Path of the tab-separated input file.</param>
void GenerateFamousIdSet(string inputFile)
{
    // 'using' guarantees the file handle is released even if a line fails to process.
    using (StreamReader reader = new StreamReader(inputFile))
    {
        int lineCount = 0;
        string currentLine;
        while ((currentLine = reader.ReadLine()) != null)
        {
            lineCount++;
            if (lineCount % 1000000 == 0)
            {
                Console.WriteLine("depth:{0}\tline:{1}\tcount:{2}", depth, lineCount,idPathList.Count);
            }

            string[] tokens = currentLine.Split('\t');

            // Blank or truncated lines cannot carry subject/predicate/object; skip them
            // rather than indexing past the end of the array.
            if (tokens.Length < 3)
            {
                continue;
            }

            if (tokens[1] != "<type.object.name>" && tokens[1] != "<common.topic.alias>")
            {
                continue;
            }

            string loweredName = tokens[2].ToLower();
            if (!famousEntitySet.Contains(loweredName) || !IsId(tokens[0]))
            {
                continue;
            }

            if (idPathList.ContainsKey(tokens[0]) == false)
            {
                idPathList.Add(tokens[0], new List<string>());
            }

            idPathList[tokens[0]].Add(loweredName + "\t");
        }
    }
}