예제 #1
0
        /// <summary>
        /// Aggregates per-pattern counts from a tab-separated input file and writes a
        /// ranked summary to the output file.
        /// </summary>
        /// <remarks>
        /// Each input line is read as <c>pattern \t f1 \t f2</c>. The text "what s " inside
        /// the pattern is rewritten to "what be " before f1 is added to
        /// <c>patternFrequency</c> and f2 to <c>patternMaxfrequency</c>. Neither dictionary
        /// is cleared here, so the totals build on whatever they already hold.
        /// Each output row is <c>pattern \t frequency \t maxFrequency \t ratio</c>, ordered
        /// by <c>DinoComparerStringDouble</c>.
        /// </remarks>
        /// <param name="inputFile">Path of the tab-separated file to read.</param>
        /// <param name="outputFile">Path of the file to create or overwrite.</param>
        /// <exception cref="IndexOutOfRangeException">A line has fewer than three columns.</exception>
        /// <exception cref="FormatException">The second or third column is not an integer.</exception>
        public void Formalize(string inputFile, string outputFile)
        {
            // Both streams are still opened up front, but "using" guarantees they are
            // released even when a malformed line makes parsing throw. Previously the
            // reader was never closed at all.
            using (StreamReader reader = new StreamReader(inputFile))
            using (StreamWriter writer = new StreamWriter(outputFile))
            {
                string currentLine;
                while ((currentLine = reader.ReadLine()) != null)
                {
                    string[] tokens = currentLine.Split('\t');
                    int f1 = Convert.ToInt32(tokens[1]);
                    int f2 = Convert.ToInt32(tokens[2]);
                    string pattern = tokens[0].Replace("what s ", "what be ");

                    if (!patternFrequency.ContainsKey(pattern))
                    {
                        patternFrequency.Add(pattern, 0);
                    }
                    patternFrequency[pattern] += f1;

                    if (!patternMaxfrequency.ContainsKey(pattern))
                    {
                        patternMaxfrequency.Add(pattern, 0);
                    }
                    patternMaxfrequency[pattern] += f2;
                }

                List<KeyValuePair<string, double>> rows = new List<KeyValuePair<string, double>>();
                foreach (string key in patternFrequency.Keys)
                {
                    // Patterns without a max-frequency entry cannot be given a ratio.
                    if (!patternMaxfrequency.ContainsKey(key))
                    {
                        continue;
                    }

                    string label = key + "\t" + patternFrequency[key] + "\t" + patternMaxfrequency[key];
                    double ratio = Convert.ToDouble(patternFrequency[key]) / patternMaxfrequency[key];
                    rows.Add(new KeyValuePair<string, double>(label, ratio));
                }

                rows.Sort(new DinoComparerStringDouble());

                foreach (var row in rows)
                {
                    writer.WriteLine(row.Key + "\t" + row.Value);
                }
            }
        }
        /// <summary>
        /// Copies the first <paramref name="n"/> eligible lines of a tab-separated file,
        /// after ordering them by <c>DinoComparerStringDouble</c>, to the output file.
        /// </summary>
        /// <remarks>
        /// A line is skipped when it contains '~' or when its first two columns are equal.
        /// The last column of each kept line is parsed as the score used for ordering.
        /// Despite the method name, no random sampling takes place: rows are taken in
        /// sorted order.
        /// </remarks>
        /// <param name="inputFile">Path of the tab-separated file to read.</param>
        /// <param name="outputFile">Path of the file to create or overwrite.</param>
        /// <param name="n">
        /// Maximum number of lines to write. If fewer lines are eligible, all of them are
        /// written; a value of zero or less writes nothing.
        /// </param>
        /// <exception cref="IndexOutOfRangeException">A line without '~' has fewer than two columns.</exception>
        /// <exception cref="FormatException">The last column of a kept line is not a number.</exception>
        public void SampleTopN(string inputFile, string outputFile, int n)
        {
            using (StreamReader reader = new StreamReader(inputFile))
            using (StreamWriter writer = new StreamWriter(outputFile))
            {
                List<KeyValuePair<string, double>> scoredLines = new List<KeyValuePair<string, double>>();

                string currentLine;
                while ((currentLine = reader.ReadLine()) != null)
                {
                    if (currentLine.Contains('~'))
                    {
                        continue;
                    }

                    string[] tokens = currentLine.Split('\t');
                    if (tokens[0] == tokens[1])
                    {
                        continue;
                    }

                    // The score is the final column; the whole line is kept as the payload.
                    double score = Convert.ToDouble(tokens[tokens.Length - 1]);
                    scoredLines.Add(new KeyValuePair<string, double>(currentLine, score));
                }

                scoredLines.Sort(new DinoComparerStringDouble());

                // Clamp to the number of available rows. Indexing past the end used to
                // throw midway through writing and leave both files open.
                int limit = Math.Min(n, scoredLines.Count);
                for (int i = 0; i < limit; i++)
                {
                    writer.WriteLine(scoredLines[i].Key);
                }
            }
        }
        /// <summary>
        /// Groups the "people." lines of a tab-separated file by predicate, then writes the
        /// first one or two lines (in <c>DinoComparerStringDouble</c> order) of up to 140
        /// randomly chosen predicates.
        /// </summary>
        /// <remarks>
        /// The third column is used as the predicate and the fourth column as an integer
        /// frequency. Both are parsed for every line, including lines that are then
        /// discarded because they do not contain "people.".
        /// Predicates are picked by random draws with rejection of already used ones.
        /// Selection stops for good as soon as ten consecutive draws all hit used
        /// predicates, so fewer than 140 predicates may be written even when unused ones
        /// remain.
        /// Before writing, the method prints a diagnostic count and waits for a key press.
        /// </remarks>
        /// <param name="inputFile">Path of the tab-separated file to read.</param>
        /// <param name="outputFile">Path of the file to create or overwrite.</param>
        /// <exception cref="IndexOutOfRangeException">A line has fewer than four columns.</exception>
        /// <exception cref="FormatException">The fourth column is not an integer.</exception>
        public void Process(string inputFile, string outputFile)
        {
            const int maxPredicates = 140;
            const int maxDrawAttempts = 10;

            Dictionary<string, List<KeyValuePair<string, double>>> pToTf = new Dictionary<string, List<KeyValuePair<string, double>>>();
            List<string> predicateList = new List<string>();
            int marriageLineCount = 0;

            using (StreamReader reader = new StreamReader(inputFile))
            {
                string currentLine;
                while ((currentLine = reader.ReadLine()) != null)
                {
                    string[] tokens = currentLine.Split('\t');
                    string predicate = tokens[2];
                    double frequency = Convert.ToInt32(tokens[3]);

                    if (!currentLine.Contains("people."))
                    {
                        continue;
                    }

                    if (currentLine.Contains("-mso/people.person.marriage-mso/time.event.person-mso/type.object.name"))
                    {
                        marriageLineCount++;
                    }

                    if (!pToTf.ContainsKey(predicate))
                    {
                        pToTf.Add(predicate, new List<KeyValuePair<string, double>>());
                        predicateList.Add(predicate);
                    }
                    pToTf[predicate].Add(new KeyValuePair<string, double>(currentLine, frequency));
                }
            }

            // NOTE(review): this looks like leftover debugging. ReadKey blocks until a key
            // is pressed. It is kept to preserve existing behavior; confirm whether it can go.
            Console.WriteLine(marriageLineCount);
            Console.ReadKey();

            using (StreamWriter writer = new StreamWriter(outputFile))
            {
                // With no predicates there is nothing to draw from. Indexing the empty
                // list used to throw here, so an empty output file is written instead.
                if (predicateList.Count == 0)
                {
                    return;
                }

                HashSet<string> used = new HashSet<string>();
                Random r = new Random();

                for (int i = 0; i < maxPredicates; i++)
                {
                    // Rejection sampling: draw until an unused predicate turns up or the
                    // attempt budget is exhausted.
                    string predicate = null;
                    for (int attempt = 0; attempt < maxDrawAttempts; attempt++)
                    {
                        string candidate = predicateList[r.Next(predicateList.Count)];
                        if (!used.Contains(candidate))
                        {
                            predicate = candidate;
                            break;
                        }
                    }

                    if (predicate == null)
                    {
                        break;
                    }
                    used.Add(predicate);

                    // Every list holds at least one entry, because a list is only created
                    // together with its first line.
                    List<KeyValuePair<string, double>> entries = pToTf[predicate];
                    entries.Sort(new DinoComparerStringDouble());
                    writer.WriteLine(entries[0].Key);
                    if (entries.Count > 1)
                    {
                        writer.WriteLine(entries[1].Key);
                    }
                }
            }
        }
예제 #4
0
        /// <summary>
        /// Writes every (category, path) score held in <c>categoryPathScore</c> to a file
        /// as <c>category \t path \t score</c>, ordered by <c>DinoComparerStringDouble</c>.
        /// </summary>
        /// <param name="outputFile">Path of the file to create or overwrite.</param>
        public void OutputCategoryPathScore(string outputFile)
        {
            // Flatten the nested category -> path -> score map into sortable rows.
            List<KeyValuePair<string, double>> rows = new List<KeyValuePair<string, double>>();
            foreach (string category in categoryPathScore.Keys)
            {
                foreach (string path in categoryPathScore[category].Keys)
                {
                    rows.Add(new KeyValuePair<string, double>(category + "\t" + path, categoryPathScore[category][path]));
                }
            }

            rows.Sort(new DinoComparerStringDouble());

            // The file is opened only once the rows are ready, and "using" releases the
            // handle even if a write fails.
            using (StreamWriter writer = new StreamWriter(outputFile))
            {
                foreach (var row in rows)
                {
                    writer.WriteLine(row.Key + "\t" + row.Value);
                }
            }
        }
예제 #5
0
        /// <summary>
        /// Writes one row per pattern found in both <c>patternFrequency</c> and
        /// <c>patternMaxfrequency</c>, as <c>pattern \t frequency \t maxFrequency \t ratio</c>,
        /// ordered by <c>DinoComparerStringDouble</c>.
        /// </summary>
        /// <param name="outputFile">Path of the file to create or overwrite.</param>
        public void OutputPatternFrequency(string outputFile)
        {
            List<KeyValuePair<string, double>> rows = new List<KeyValuePair<string, double>>();
            foreach (string pattern in patternFrequency.Keys)
            {
                // Patterns without a max-frequency entry cannot be given a ratio.
                if (!patternMaxfrequency.ContainsKey(pattern))
                {
                    continue;
                }

                string label = pattern + "\t" + patternFrequency[pattern] + "\t" + patternMaxfrequency[pattern];
                double ratio = Convert.ToDouble(patternFrequency[pattern]) / patternMaxfrequency[pattern];
                rows.Add(new KeyValuePair<string, double>(label, ratio));
            }

            rows.Sort(new DinoComparerStringDouble());

            // "using" releases the file handle even if a write fails part-way through.
            using (StreamWriter writer = new StreamWriter(outputFile))
            {
                foreach (var row in rows)
                {
                    writer.WriteLine(row.Key + "\t" + row.Value);
                }
            }
        }
예제 #6
0
        /// <summary>
        /// Scores every token that occurs in a "good" pattern by how often it appears in
        /// good versus bad patterns, and writes the tokens ordered by
        /// <c>DinoComparerStringDouble</c>.
        /// </summary>
        /// <remarks>
        /// Input lines are lower-cased and split on tabs into at least three columns. Lines
        /// whose second column contains "government" are ignored. Spaces in the second
        /// column become underscores, the first two columns are joined with a space, and
        /// the text from the first '&lt;' through the first '&gt;' is replaced by a single space.
        /// Each output row is <c>token \t score \t goodCount \t badCount</c>.
        /// </remarks>
        /// <param name="inputFile">Path of the tab-separated, labelled file to read.</param>
        /// <param name="outputFile">Path of the file to create or overwrite.</param>
        /// <exception cref="IndexOutOfRangeException">A line has fewer than three columns.</exception>
        /// <exception cref="ArgumentOutOfRangeException">A kept line has no '&lt;' in its first two columns.</exception>
        public void Process_Bayes(string inputFile, string outputFile)
        {
            // "using" guarantees both files are released even when a malformed line throws.
            using (StreamReader reader = new StreamReader(inputFile))
            using (StreamWriter writer = new StreamWriter(outputFile))
            {
                List<string> goodList = new List<string>();
                List<string> badList = new List<string>();
                Dictionary<string, int> tokenGoodvalue = new Dictionary<string, int>();
                Dictionary<string, int> tokenSumvalue = new Dictionary<string, int>();
                Dictionary<string, int> tokenBadvalue = new Dictionary<string, int>();

                string currentLine;
                while ((currentLine = reader.ReadLine()) != null)
                {
                    // NOTE(review): ToLower() uses the current culture - confirm that is intended.
                    currentLine = currentLine.ToLower();
                    string[] columns = currentLine.Split('\t');
                    string label = columns[2];

                    if (columns[1].Contains("government"))
                    {
                        continue;
                    }

                    columns[1] = columns[1].Replace(' ', '_');

                    // Cut out the "<...>" span. A line without '<' makes Substring throw.
                    string pattern = columns[0] + " " + columns[1];
                    pattern = pattern.Substring(0, pattern.IndexOf('<')) + " " + pattern.Substring(pattern.IndexOf('>') + 1);

                    // Label "true" goes to the bad list; every other label counts as good.
                    // NOTE(review): confirm this polarity against whoever produces the labels.
                    if (label == "true")
                    {
                        badList.Add(pattern);
                    }
                    else
                    {
                        goodList.Add(pattern);
                    }
                }

                foreach (string pattern in goodList)
                {
                    foreach (string token in pattern.Split(' '))
                    {
                        if (token == "")
                        {
                            continue;
                        }

                        // Seed the bad counter too, so every scored token has an entry in both maps.
                        if (!tokenGoodvalue.ContainsKey(token))
                        {
                            tokenGoodvalue.Add(token, 0);
                            tokenBadvalue.Add(token, 0);
                        }
                        tokenGoodvalue[token]++;

                        if (!tokenSumvalue.ContainsKey(token))
                        {
                            tokenSumvalue.Add(token, 0);
                        }
                        tokenSumvalue[token]++;
                    }
                }

                foreach (string pattern in badList)
                {
                    foreach (string token in pattern.Split(' '))
                    {
                        if (token == "")
                        {
                            continue;
                        }

                        if (!tokenBadvalue.ContainsKey(token))
                        {
                            tokenBadvalue.Add(token, 0);
                        }
                        tokenBadvalue[token]++;

                        if (!tokenSumvalue.ContainsKey(token))
                        {
                            tokenSumvalue.Add(token, 0);
                        }
                        tokenSumvalue[token]++;
                    }
                }

                // Only tokens seen in at least one good pattern are scored. The score is
                // (good / total) / ((bad + 1) / total); the +1 keeps the divisor non-zero.
                // The expression is left in this exact form so printed values do not change.
                List<KeyValuePair<string, double>> scores = new List<KeyValuePair<string, double>>();
                foreach (string token in tokenGoodvalue.Keys)
                {
                    double score = Convert.ToDouble(tokenGoodvalue[token]) / tokenSumvalue[token] / (Convert.ToDouble(tokenBadvalue[token] + 1) / tokenSumvalue[token]);
                    scores.Add(new KeyValuePair<string, double>(token, score));
                }

                scores.Sort(new DinoComparerStringDouble());

                foreach (var item in scores)
                {
                    writer.WriteLine(item.Key + "\t" + item.Value + "\t" + tokenGoodvalue[item.Key] + "\t" + tokenBadvalue[item.Key]);
                }
            }
        }
 /// <summary>
 /// For every pattern in <c>patternCluepathvalueFrequency</c>, writes a header line
 /// followed by one line per clue path value with its two counts and their ratio,
 /// then a blank line.
 /// </summary>
 /// <remarks>
 /// The header is <c>pattern \t frequency</c>, where the frequency is looked up in
 /// <c>patternFrequency</c> using the part of the pattern before its first tab.
 /// Each detail line is <c>cluePathValue \t clueCount \t pathCount \t ratio</c>, with
 /// ratio = clueCount / pathCount, ordered by <c>DinoComparerStringDouble</c>.
 /// Patterns without any clue path value are skipped.
 /// </remarks>
 /// <param name="outputFile">Path of the file to create or overwrite.</param>
 public void OutputCorrelation(string outputFile)
 {
     // "using" releases the file even if one of the dictionary lookups below throws.
     using (StreamWriter writer = new StreamWriter(outputFile))
     {
         foreach (string pattern in patternCluepathvalueFrequency.Keys)
         {
             var clueCounts = patternCluepathvalueFrequency[pattern];
             List<KeyValuePair<string, double>> rows = new List<KeyValuePair<string, double>>();

             foreach (string cluePathValue in clueCounts.Keys)
             {
                 // A filter used to sit here but was disabled in favour of keeping every entry:
                 //   patternCluepathvalueFrequency[pattern][clupathvalue] * 2 > patternHasanswernode[pattern]
                 var clueCount = clueCounts[cluePathValue];
                 var pathCount = patternPathvalueFrequency[pattern][cluePathValue];
                 double ratio = Convert.ToDouble(clueCount) / Convert.ToDouble(pathCount);
                 rows.Add(new KeyValuePair<string, double>(cluePathValue + "\t" + clueCount + "\t" + pathCount, ratio));
             }

             if (rows.Count == 0)
             {
                 continue;
             }

             rows.Sort(new DinoComparerStringDouble());

             writer.WriteLine(pattern + "\t" + patternFrequency[pattern.Split('\t')[0]]);
             foreach (var row in rows)
             {
                 writer.WriteLine(row.Key + "\t" + row.Value);
             }
             writer.WriteLine();
         }
     }
 }
        /// <summary>
        /// Loads (category + path, frequency) pairs from a tab-separated file into
        /// <c>categorypathFrequency</c>, sorts that list with
        /// <c>DinoComparerStringDouble</c> and writes it out as <c>key \t frequency</c>.
        /// </summary>
        /// <remarks>
        /// The key is the first two columns concatenated without a separator. Entries
        /// whose key contains '~' are dropped. <c>categorypathFrequency</c> is not cleared
        /// first, so entries already in the list are sorted and written as well.
        /// </remarks>
        /// <param name="inputFile">Path of the tab-separated file to read.</param>
        /// <param name="outputFile">Path of the file to create or overwrite.</param>
        /// <exception cref="IndexOutOfRangeException">A line has fewer than three columns.</exception>
        /// <exception cref="FormatException">The third column is not a number.</exception>
        public void Process(string inputFile, string outputFile)
        {
            // "using" guarantees both files are released even when a malformed line throws.
            using (StreamReader reader = new StreamReader(inputFile))
            using (StreamWriter writer = new StreamWriter(outputFile))
            {
                string currentLine;
                while ((currentLine = reader.ReadLine()) != null)
                {
                    string[] tokens = currentLine.Split('\t');
                    string categorypath = tokens[0] + tokens[1];
                    double frequency = Convert.ToDouble(tokens[2]);

                    if (categorypath.Contains('~'))
                    {
                        continue;
                    }

                    categorypathFrequency.Add(new KeyValuePair<string, double>(categorypath, frequency));
                }

                categorypathFrequency.Sort(new DinoComparerStringDouble());

                foreach (var entry in categorypathFrequency)
                {
                    writer.WriteLine(entry.Key + "\t" + entry.Value);
                }
            }
        }
        /// <summary>
        /// Reads scored (pattern, path) rows from a tab-separated file and writes the
        /// first 10,000 of them, in <c>DinoComparerStringDouble</c> order, as
        /// <c>pattern \t path \t score</c>.
        /// </summary>
        /// <remarks>
        /// The second and fourth columns form the key and the fifth column is the score.
        /// Rows whose key contains '~' are not written and do not count towards the limit.
        /// </remarks>
        /// <param name="inputFile">Path of the tab-separated file to read.</param>
        /// <param name="outputFile">Path of the file to create or overwrite.</param>
        /// <exception cref="IndexOutOfRangeException">A line has fewer than five columns.</exception>
        /// <exception cref="FormatException">The fifth column is not a number.</exception>
        public void GenerateTopPatternPath(string inputFile, string outputFile)
        {
            const int maxRows = 10000;

            // "using" guarantees both files are released even when a malformed line throws.
            using (StreamReader reader = new StreamReader(inputFile))
            using (StreamWriter writer = new StreamWriter(outputFile))
            {
                List<KeyValuePair<string, double>> patternpathToScore = new List<KeyValuePair<string, double>>();

                string currentLine;
                while ((currentLine = reader.ReadLine()) != null)
                {
                    string[] tokens = currentLine.Split('\t');
                    string patternpath = tokens[1] + "\t" + tokens[3];
                    double score = Convert.ToDouble(tokens[4]);
                    patternpathToScore.Add(new KeyValuePair<string, double>(patternpath, score));
                }

                patternpathToScore.Sort(new DinoComparerStringDouble());

                int written = 0;
                foreach (var item in patternpathToScore)
                {
                    if (item.Key.Contains('~'))
                    {
                        continue;
                    }

                    writer.WriteLine(item.Key + "\t" + item.Value);
                    written++;

                    // The limit is checked after writing. The earlier check ran before the
                    // write, so the 10,000th row was never emitted.
                    if (written >= maxRows)
                    {
                        break;
                    }
                }
            }
        }