コード例 #1
0
        /// <summary>
        /// Learns, for every leading bigram of a question, the category it co-occurs
        /// with most often, and writes one "bigram&lt;TAB&gt;category" line per bigram
        /// to <paramref name="gramCategoryFile"/>.
        /// </summary>
        /// <param name="categoryFile">
        /// File with one category label per line. Only the text before the first space
        /// is used; the part after ':' is dropped unless the label is exactly "NUM:date".
        /// The N-th line is the label of the N-th question in <paramref name="postagFile"/>.
        /// </param>
        /// <param name="postagFile">
        /// File with space-separated tokens. A question may span several lines and ends
        /// at the first line that makes the accumulated text contain '?'.
        /// Tokens are presumably of the form word_TAG (only the part before '_' is kept)
        /// - TODO confirm against the file producer.
        /// </param>
        /// <param name="gramCategoryFile">Output file; it is created or overwritten.</param>
        /// <remarks>
        /// A KeyNotFoundException is thrown when <paramref name="postagFile"/> contains
        /// more questions than <paramref name="categoryFile"/> contains labels.
        /// </remarks>
        public void Process(string categoryFile, string postagFile,string gramCategoryFile)
        {
            // Category label of each question, keyed by its zero-based line index.
            Dictionary<int, string> orderCategory = new Dictionary<int, string>();
            string currentLine;
            int order = 0;

            using (StreamReader reader = new StreamReader(categoryFile))
            {
                while ((currentLine = reader.ReadLine()) != null)
                {
                    string category = currentLine.Split(' ')[0];
                    // Keep only the coarse label, except for the special "NUM:date" label.
                    if (category != "NUM:date")
                    {
                        category = category.Split(':')[0];
                    }
                    orderCategory.Add(order, category);
                    order++;
                }
            }

            CFormalizeString stringFormalizor = new CFormalizeString();
            // bigram -> (category -> number of questions with that bigram and category)
            Dictionary<string, Dictionary<string, int>> bigramCategoryF = new Dictionary<string, Dictionary<string, int>>();
            order = 0;

            using (StreamReader reader = new StreamReader(postagFile))
            {
                while ((currentLine = reader.ReadLine()) != null)
                {
                    // Join continuation lines until the question mark shows up.
                    string fullLine = currentLine;
                    bool terminated = fullLine.IndexOf('?') >= 0;
                    while (!terminated && (currentLine = reader.ReadLine()) != null)
                    {
                        fullLine = fullLine + " " + currentLine;
                        terminated = fullLine.IndexOf('?') >= 0;
                    }

                    // End of file reached inside an unterminated fragment (for example a
                    // trailing blank line): drop the fragment instead of looping forever.
                    if (!terminated)
                    {
                        break;
                    }

                    string[] tokens = fullLine.ToLower().Split(' ');
                    string bigram = "";
                    string category = orderCategory[order];
                    order++;

                    if (tokens[0].StartsWith("what"))
                    {
                        // For "what" questions pair "what" with the first token whose text
                        // contains "_n" (presumably a noun tag - TODO confirm).
                        foreach (string token in tokens)
                        {
                            if (token.Contains("_n"))
                            {
                                bigram = "what" + " " + token.Split('_')[0];
                                break;
                            }
                        }
                    }
                    else if (tokens.Length >= 2)
                    {
                        // Otherwise use the first two words of the question.
                        bigram = tokens[0].Split('_')[0] + " " + tokens[1].Split('_')[0];
                    }
                    // A question with a single token yields no bigram and is skipped,
                    // but it still consumes its category line above.

                    if (bigram != "")
                    {
                        bigram = stringFormalizor.FormalizeString(bigram);

                        Dictionary<string, int> categoryCounts;
                        if (!bigramCategoryF.TryGetValue(bigram, out categoryCounts))
                        {
                            categoryCounts = new Dictionary<string, int>();
                            bigramCategoryF.Add(bigram, categoryCounts);
                        }

                        int seen;
                        categoryCounts.TryGetValue(category, out seen);
                        categoryCounts[category] = seen + 1;
                    }
                }
            }

            // The output is opened only after all input was processed, so a failure while
            // reading does not leave behind a truncated result file.
            using (StreamWriter writer = new StreamWriter(gramCategoryFile))
            {
                foreach (KeyValuePair<string, Dictionary<string, int>> entry in bigramCategoryF)
                {
                    int maxF = 0;
                    string maxCategory = "";
                    // Strict '>' means the first category enumerated wins a tie.
                    foreach (KeyValuePair<string, int> categoryCount in entry.Value)
                    {
                        if (categoryCount.Value > maxF)
                        {
                            maxF = categoryCount.Value;
                            maxCategory = categoryCount.Key;
                        }
                    }
                    writer.WriteLine(entry.Key + "\t" + maxCategory);
                }
            }
        }
コード例 #2
0
        /// <summary>
        /// Scans a tab-separated file and records, in <c>idPathList</c>, the lower-cased
        /// names and aliases of every id whose name is contained in <c>famousEntitySet</c>.
        /// </summary>
        /// <param name="inputFile">
        /// Path of the file to scan. Each line is split on tabs; the first three fields
        /// are read as id, predicate and value. Lines with fewer than three fields are skipped.
        /// </param>
        /// <remarks>
        /// Only lines whose predicate is "&lt;type.object.name&gt;" or
        /// "&lt;common.topic.alias&gt;" and whose id passes <c>IsId</c> are considered.
        /// A progress line is printed to the console every 1,000,000 input lines.
        /// </remarks>
        void GenerateFamousIdSet(string inputFile)
        {
            string currentLine;
            int lineCount = 0;

            using (StreamReader reader = new StreamReader(inputFile))
            {
                while ((currentLine = reader.ReadLine()) != null)
                {
                    lineCount++;
                    if (lineCount % 1000000 == 0)
                    {
                        Console.WriteLine("depth:{0}\tline:{1}\tcount:{2}", depth, lineCount,idPathList.Count);
                    }

                    string[] tokens = currentLine.Split('\t');
                    // Malformed line: not enough fields to hold id, predicate and value.
                    if (tokens.Length < 3)
                    {
                        continue;
                    }

                    // Check the cheap predicate test first so that the value is only
                    // lower-cased for name/alias lines.
                    if (tokens[1] != "<type.object.name>" && tokens[1] != "<common.topic.alias>")
                    {
                        continue;
                    }

                    string loweredName = tokens[2].ToLower();
                    if (!famousEntitySet.Contains(loweredName) || !IsId(tokens[0]))
                    {
                        continue;
                    }

                    if (idPathList.ContainsKey(tokens[0]) == false)
                    {
                        idPathList.Add(tokens[0], new List<string>());
                    }
                    // The entry is stored with a trailing tab; presumably later steps
                    // append further fields to it - TODO confirm against the consumers.
                    idPathList[tokens[0]].Add(loweredName + "\t");
                }
            }
        }