예제 #1
0
 /// <summary>
 /// Counts, for every valid concept, how often each valid non-stopword word and each
 /// pair of directly adjacent valid words occurs in the concept's word stream, stores
 /// the counts in a new <c>termfrequency</c> instance and saves it with
 /// <c>saveData(bigram_threshold)</c>. Progress and timings are written to the console.
 /// </summary>
 /// <param name="numthreads">Not referenced by this method.</param>
 /// <param name="wikidata">Provides <c>worddict</c> (word text to word id) and <c>conceptdata[id].conceptwords</c> (the word-id stream of a concept).</param>
 /// <param name="validwords">Word ids that are counted; any other id in a stream interrupts the bigram chain.</param>
 /// <param name="validconcepts">Ids of the concepts whose word streams are scanned.</param>
 /// <param name="crosswalk_words">Maps an original word id to the id used as key in <c>freq.allterms</c>.</param>
 /// <param name="crosswalk_concepts">Maps an original concept id to the id used as key in the count dictionaries.</param>
 /// <param name="stopwords">Word ids that are never counted and interrupt the bigram chain.</param>
 /// <param name="bigram_threshold">Passed unchanged to <c>freq.saveData</c>.</param>
 static void createFreqResources(int numthreads, WikiData wikidata, HashSet<int> validwords, HashSet<int> validconcepts, Dictionary<int, int> crosswalk_words, Dictionary<int, int> crosswalk_concepts, HashSet<int> stopwords, int bigram_threshold)
 {
     //populate termfrequency class
     termfrequency freq = new termfrequency();

     //invert the word dictionary so a word id can be turned back into its text
     Dictionary<int, string> int2word = new Dictionary<int, string>();
     foreach (KeyValuePair<string, int> kvp in wikidata.worddict)
     {
         int2word.Add(kvp.Value, kvp.Key);
     }

     //register one term entry per valid word, keyed by its mapped id
     foreach (int wordid in validwords)
     {
         int mappedword = crosswalk_words[wordid];
         freq.allterms.TryAdd(mappedword, new termfrequency.singleterm());
         freq.allterms[mappedword].word = int2word[wordid];
     }

     //loop over all words and bigrams in all concepts
     DateTimeOffset currentTime = DateTimeOffset.Now;
     DateTimeOffset startTime = DateTimeOffset.Now;
     int conceptcounter = 0;
     Console.WriteLine("Aggregating words and bigrams across concepts:");
     foreach (int id in validconcepts)
     {
         int mappedid = crosswalk_concepts[id];
         conceptcounter++;

         //term of the directly preceding counted word; null when the chain is broken
         termfrequency.singleterm previous = null;
         int[] stream = wikidata.conceptdata[id].conceptwords;
         for (int i = 0; i < stream.Length; i++)
         {
             int wordid = stream[i];
             if (stopwords.Contains(wordid) || !validwords.Contains(wordid))
             {
                 wordid = -1;
             }
             else if (wordid >= 0)
             {
                 wordid = crosswalk_words[wordid];
             }
             // NOTE(review): a negative id other than -1 that is contained in validwords
             // stays unmapped here, exactly as before this change - confirm whether such ids can occur.

             //not in idf
             if (wordid == -1)
             {
                 previous = null;
                 continue;
             }

             //resolve the term once and reuse it for the word count and as the next bigram head
             termfrequency.singleterm current = freq.allterms[wordid];

             //TryGetValue leaves the count at 0 for an unseen concept, so one path covers insert and increment
             int wordcount;
             current.conceptfrequency.TryGetValue(mappedid, out wordcount);
             current.conceptfrequency[mappedid] = wordcount + 1;

             if (previous != null)
             {
                 Dictionary<int, int> bigramcounts;
                 if (!previous.bigrams.TryGetValue(wordid, out bigramcounts))
                 {
                     bigramcounts = new Dictionary<int, int>();
                     previous.bigrams.Add(wordid, bigramcounts);
                 }
                 int bigramcount;
                 bigramcounts.TryGetValue(mappedid, out bigramcount);
                 bigramcounts[mappedid] = bigramcount + 1;
             }
             previous = current;
         }

         //refresh the progress line at most about once per second
         if ((DateTimeOffset.Now - currentTime).TotalSeconds > 1)
         {
             double done = 1.0 * conceptcounter / validconcepts.Count * 100;
             Console.Write("{0:F2} percent done\r", done);
             currentTime = DateTimeOffset.Now;
         }
     }
     Console.WriteLine("100 percent done in {0:F2} minutes.", (DateTimeOffset.Now - startTime).TotalMinutes);

     //saving data
     Console.WriteLine("Saving frequency data ...");
     startTime = DateTimeOffset.Now;
     freq.saveData(bigram_threshold);
     Console.WriteLine("Frequency data saved in {0:F2} minutes ...", (DateTimeOffset.Now - startTime).TotalMinutes);
 }
예제 #2
0
        /// <summary>
        /// Counts, for every valid concept, how often each valid non-stopword word and each
        /// pair of directly adjacent valid words occurs in the concept's word stream, stores
        /// the counts in a new <c>termfrequency</c> instance and saves it with
        /// <c>saveData(bigram_threshold)</c>. Progress and timings are written to the console.
        /// </summary>
        /// <param name="numthreads">Not referenced by this method.</param>
        /// <param name="wikidata">Provides <c>worddict</c> (word text to word id) and <c>conceptdata[id].conceptwords</c> (the word-id stream of a concept).</param>
        /// <param name="validwords">Word ids that are counted; any other id in a stream interrupts the bigram chain.</param>
        /// <param name="validconcepts">Ids of the concepts whose word streams are scanned.</param>
        /// <param name="crosswalk_words">Maps an original word id to the id used as key in <c>freq.allterms</c>.</param>
        /// <param name="crosswalk_concepts">Maps an original concept id to the id used as key in the count dictionaries.</param>
        /// <param name="stopwords">Word ids that are never counted and interrupt the bigram chain.</param>
        /// <param name="bigram_threshold">Passed unchanged to <c>freq.saveData</c>.</param>
        static void createFreqResources(int numthreads, WikiData wikidata, HashSet <int> validwords, HashSet <int> validconcepts, Dictionary <int, int> crosswalk_words, Dictionary <int, int> crosswalk_concepts, HashSet <int> stopwords, int bigram_threshold)
        {
            //populate termfrequency class
            termfrequency            freq     = new termfrequency();
            Dictionary <int, string> int2word = new Dictionary <int, string>();

            //invert the word dictionary so a word id can be turned back into its text
            foreach (KeyValuePair <string, int> kvp in wikidata.worddict)
            {
                int2word.Add(kvp.Value, kvp.Key);
            }
            //register one term entry per valid word; the crosswalk is resolved once per word
            foreach (int wordid in validwords)
            {
                int termkey = crosswalk_words[wordid];
                freq.allterms.TryAdd(termkey, new termfrequency.singleterm());
                freq.allterms[termkey].word = int2word[wordid];
            }
            //loop over all words and bigrams in all concepts
            DateTimeOffset currentTime    = DateTimeOffset.Now;
            DateTimeOffset startTime      = DateTimeOffset.Now;
            int            conceptcounter = 0;

            Console.WriteLine("Aggregating words and bigrams across concepts:");
            foreach (int id in validconcepts)
            {
                int mappedid = crosswalk_concepts[id];
                conceptcounter++;
                //term of the directly preceding counted word; null while no bigram can be formed
                termfrequency.singleterm headterm = null;
                int[]                    stream   = wikidata.conceptdata[id].conceptwords;
                for (int i = 0; i < stream.Length; i++)
                {
                    int wordid = stream[i];
                    if (stopwords.Contains(wordid) || !validwords.Contains(wordid))
                    {
                        wordid = -1;
                    }
                    else if (wordid >= 0)
                    {
                        wordid = crosswalk_words[wordid];
                    }
                    // NOTE(review): a negative id other than -1 that is contained in validwords
                    // stays unmapped here, exactly as before this change - confirm whether such ids can occur.

                    //not in idf
                    if (wordid == -1)
                    {
                        headterm = null;
                    }
                    else
                    {
                        //single lookup of the term; reused for the word count and as the next bigram head
                        termfrequency.singleterm term = freq.allterms[wordid];

                        //a missing concept yields 0 from TryGetValue, so insert and increment share one path
                        int seen;
                        term.conceptfrequency.TryGetValue(mappedid, out seen);
                        term.conceptfrequency[mappedid] = seen + 1;

                        if (headterm != null)
                        {
                            Dictionary <int, int> perconcept;
                            if (!headterm.bigrams.TryGetValue(wordid, out perconcept))
                            {
                                perconcept = new Dictionary <int, int>();
                                headterm.bigrams.Add(wordid, perconcept);
                            }
                            int seenpairs;
                            perconcept.TryGetValue(mappedid, out seenpairs);
                            perconcept[mappedid] = seenpairs + 1;
                        }
                        headterm = term;
                    }
                }
                //refresh the progress line at most about once per second
                if ((DateTimeOffset.Now - currentTime).TotalSeconds > 1)
                {
                    double done = 1.0 * conceptcounter / validconcepts.Count * 100;
                    Console.Write("{0:F2} percent done\r", done);
                    currentTime = DateTimeOffset.Now;
                }
            }
            Console.WriteLine("100 percent done in {0:F2} minutes.", (DateTimeOffset.Now - startTime).TotalMinutes);
            //saving data
            Console.WriteLine("Saving frequency data ...");
            startTime = DateTimeOffset.Now;
            freq.saveData(bigram_threshold);
            Console.WriteLine("Frequency data saved in {0:F2} minutes ...", (DateTimeOffset.Now - startTime).TotalMinutes);
        }