/// <summary>
/// Counts, for every valid concept, how often each valid word and each pair of
/// consecutive valid words (bigram) occurs, then saves the result through
/// <c>termfrequency.saveData</c>. Progress and timing are written to the console.
/// </summary>
/// <param name="numthreads">Not read anywhere in this method; kept so existing callers keep compiling.</param>
/// <param name="wikidata">Supplies the word dictionary (<c>worddict</c>) and each concept's word stream (<c>conceptdata[id].conceptwords</c>).</param>
/// <param name="validwords">Original word ids that are counted; every one must be a key of <paramref name="crosswalk_words"/> and a value in <c>wikidata.worddict</c>.</param>
/// <param name="validconcepts">Original concept ids whose word streams are scanned; every one must be a key of <paramref name="crosswalk_concepts"/>.</param>
/// <param name="crosswalk_words">Maps an original word id to the id used as key in the frequency data.</param>
/// <param name="crosswalk_concepts">Maps an original concept id to the id used as key in the frequency data.</param>
/// <param name="stopwords">Original word ids that are never counted; hitting one also breaks the bigram chain.</param>
/// <param name="bigram_threshold">Passed unchanged to <c>freq.saveData</c>.</param>
static void createFreqResources(int numthreads, WikiData wikidata, HashSet<int> validwords, HashSet<int> validconcepts, Dictionary<int, int> crosswalk_words, Dictionary<int, int> crosswalk_concepts, HashSet<int> stopwords, int bigram_threshold)
{
    // Populate the termfrequency container: one entry per valid word, keyed by its crosswalked id.
    termfrequency freq = new termfrequency();
    Dictionary<int, string> int2word = new Dictionary<int, string>();
    foreach (KeyValuePair<string, int> kvp in wikidata.worddict)
    {
        int2word.Add(kvp.Value, kvp.Key);
    }
    foreach (int wordid in validwords)
    {
        // Resolve the crosswalked id once instead of once per use.
        int mappedword = crosswalk_words[wordid];
        freq.allterms.TryAdd(mappedword, new termfrequency.singleterm());
        freq.allterms[mappedword].word = int2word[wordid];
    }

    // Loop over all words and bigrams in all concepts.
    DateTimeOffset currentTime = DateTimeOffset.Now;
    DateTimeOffset startTime = DateTimeOffset.Now;
    int conceptcounter = 0;
    Console.WriteLine("Aggregating words and bigrams across concepts:");
    foreach (int id in validconcepts)
    {
        int mappedid = crosswalk_concepts[id];
        conceptcounter++;

        // Crosswalked id of the previous counted word, or -1 when there is none
        // (start of the stream, or the previous word was skipped).
        int firstbigram = -1;
        int[] stream = wikidata.conceptdata[id].conceptwords;
        for (int i = 0; i < stream.Length; i++)
        {
            int wordid = stream[i];
            if (stopwords.Contains(wordid) || !validwords.Contains(wordid))
            {
                wordid = -1;
            }
            else if (wordid >= 0)
            {
                wordid = crosswalk_words[wordid];
            }

            // -1 marks a word that is not counted (the original comment calls this
            // "not in idf"); it also ends the current bigram chain.
            if (wordid == -1)
            {
                firstbigram = -1;
                continue;
            }

            // Single lookup of the term entry; the same instance is used for all updates below.
            var term = freq.allterms[wordid];

            // TryGetValue leaves count at 0 when the concept has no entry yet,
            // so the assignment below covers both "first occurrence" and "increment".
            int count;
            term.conceptfrequency.TryGetValue(mappedid, out count);
            term.conceptfrequency[mappedid] = count + 1;

            if (firstbigram != -1)
            {
                // Bigram counts live on the FIRST word's entry: bigrams[second word][concept] -> count.
                var bigrams = freq.allterms[firstbigram].bigrams;

                // Assumes bigrams stores Dictionary<int, int> values, which is the type
                // this method has always inserted here.
                Dictionary<int, int> followercounts;
                if (!bigrams.TryGetValue(wordid, out followercounts))
                {
                    followercounts = new Dictionary<int, int>();
                    bigrams.Add(wordid, followercounts);
                }

                int paircount;
                followercounts.TryGetValue(mappedid, out paircount);
                followercounts[mappedid] = paircount + 1;
            }

            // The current word becomes the first half of the next potential bigram.
            firstbigram = wordid;
        }

        // Refresh the progress line at most about once per second.
        if ((DateTimeOffset.Now - currentTime).TotalSeconds > 1)
        {
            double done = 1.0 * conceptcounter / validconcepts.Count * 100;
            Console.Write("{0:F2} percent done\r", done);
            currentTime = DateTimeOffset.Now;
        }
    }
    Console.WriteLine("100 percent done in {0:F2} minutes.", (DateTimeOffset.Now - startTime).TotalMinutes);

    // Saving data.
    Console.WriteLine("Saving frequency data ...");
    startTime = DateTimeOffset.Now;
    freq.saveData(bigram_threshold);
    Console.WriteLine("Frequency data saved in {0:F2} minutes ...", (DateTimeOffset.Now - startTime).TotalMinutes);
}
/// <summary>
/// Counts, for every valid concept, how often each valid word and each pair of
/// consecutive valid words (bigram) occurs, then saves the result through
/// <c>termfrequency.saveData</c>. Progress and timing are written to the console.
/// </summary>
/// <param name="numthreads">Not read anywhere in this method; kept so existing callers keep compiling.</param>
/// <param name="wikidata">Supplies the word dictionary (<c>worddict</c>) and each concept's word stream (<c>conceptdata[id].conceptwords</c>).</param>
/// <param name="validwords">Original word ids that are counted; every one must be a key of <paramref name="crosswalk_words"/> and a value in <c>wikidata.worddict</c>.</param>
/// <param name="validconcepts">Original concept ids whose word streams are scanned; every one must be a key of <paramref name="crosswalk_concepts"/>.</param>
/// <param name="crosswalk_words">Maps an original word id to the id used as key in the frequency data.</param>
/// <param name="crosswalk_concepts">Maps an original concept id to the id used as key in the frequency data.</param>
/// <param name="stopwords">Original word ids that are never counted; hitting one also breaks the bigram chain.</param>
/// <param name="bigram_threshold">Passed unchanged to <c>freq.saveData</c>.</param>
static void createFreqResources(int numthreads, WikiData wikidata, HashSet<int> validwords, HashSet<int> validconcepts, Dictionary<int, int> crosswalk_words, Dictionary<int, int> crosswalk_concepts, HashSet<int> stopwords, int bigram_threshold)
{
    // Populate the termfrequency container: one entry per valid word, keyed by its crosswalked id.
    termfrequency freq = new termfrequency();
    Dictionary<int, string> int2word = new Dictionary<int, string>();
    foreach (KeyValuePair<string, int> kvp in wikidata.worddict)
    {
        int2word.Add(kvp.Value, kvp.Key);
    }
    foreach (int wordid in validwords)
    {
        // Resolve the crosswalked id once instead of once per use.
        int mappedword = crosswalk_words[wordid];
        freq.allterms.TryAdd(mappedword, new termfrequency.singleterm());
        freq.allterms[mappedword].word = int2word[wordid];
    }

    // Loop over all words and bigrams in all concepts.
    DateTimeOffset currentTime = DateTimeOffset.Now;
    DateTimeOffset startTime = DateTimeOffset.Now;
    int conceptcounter = 0;
    Console.WriteLine("Aggregating words and bigrams across concepts:");
    foreach (int id in validconcepts)
    {
        int mappedid = crosswalk_concepts[id];
        conceptcounter++;

        // Crosswalked id of the previous counted word, or -1 when there is none
        // (start of the stream, or the previous word was skipped).
        int firstbigram = -1;
        int[] stream = wikidata.conceptdata[id].conceptwords;
        for (int i = 0; i < stream.Length; i++)
        {
            int wordid = stream[i];
            if (stopwords.Contains(wordid) || !validwords.Contains(wordid))
            {
                wordid = -1;
            }
            else if (wordid >= 0)
            {
                wordid = crosswalk_words[wordid];
            }

            // -1 marks a word that is not counted (the original comment calls this
            // "not in idf"); it also ends the current bigram chain.
            if (wordid == -1)
            {
                firstbigram = -1;
                continue;
            }

            // Single lookup of the term entry; the same instance is used for all updates below.
            var term = freq.allterms[wordid];

            // TryGetValue leaves count at 0 when the concept has no entry yet,
            // so the assignment below covers both "first occurrence" and "increment".
            int count;
            term.conceptfrequency.TryGetValue(mappedid, out count);
            term.conceptfrequency[mappedid] = count + 1;

            if (firstbigram != -1)
            {
                // Bigram counts live on the FIRST word's entry: bigrams[second word][concept] -> count.
                var bigrams = freq.allterms[firstbigram].bigrams;

                // Assumes bigrams stores Dictionary<int, int> values, which is the type
                // this method has always inserted here.
                Dictionary<int, int> followercounts;
                if (!bigrams.TryGetValue(wordid, out followercounts))
                {
                    followercounts = new Dictionary<int, int>();
                    bigrams.Add(wordid, followercounts);
                }

                int paircount;
                followercounts.TryGetValue(mappedid, out paircount);
                followercounts[mappedid] = paircount + 1;
            }

            // The current word becomes the first half of the next potential bigram.
            firstbigram = wordid;
        }

        // Refresh the progress line at most about once per second.
        if ((DateTimeOffset.Now - currentTime).TotalSeconds > 1)
        {
            double done = 1.0 * conceptcounter / validconcepts.Count * 100;
            Console.Write("{0:F2} percent done\r", done);
            currentTime = DateTimeOffset.Now;
        }
    }
    Console.WriteLine("100 percent done in {0:F2} minutes.", (DateTimeOffset.Now - startTime).TotalMinutes);

    // Saving data.
    Console.WriteLine("Saving frequency data ...");
    startTime = DateTimeOffset.Now;
    freq.saveData(bigram_threshold);
    Console.WriteLine("Frequency data saved in {0:F2} minutes ...", (DateTimeOffset.Now - startTime).TotalMinutes);
}