/// <summary>
/// Extracts the typed queries from the Sogou 2012 query log and writes them, one per line,
/// as "query&lt;TAB&gt;yyyy-MM-dd HH:mm:ss" to the output file.
/// A record is skipped when the same lookup key (columns 1 and 2 concatenated) has already
/// been written and its journal entry is not yet more than 30 minutes older than the record
/// currently being processed.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void MainSogou2012(string[] args)
{
    HashSet<string> lookupPastQueries = new HashSet<string>();
    Journal journal = new Journal();
    string[] files = new string[] { @"D:\Query log data\SogouQ.2012.full\querylog.txt" };

    // The writer is wrapped in "using" so buffered output is flushed and the file handle is
    // released even when a malformed record throws part-way through the run.
    using (StreamWriter fsOutput = new StreamWriter(new FileStream(@"D:\Query log data\SogouQ.2012.full\Sogou2012-queries.txt", FileMode.CreateNew), Encoding.UTF8))
    {
        foreach (string file in files)
        {
            // Code page 936 is what the original reader used for the log files.
            using (StreamReader sr = new StreamReader(file, Encoding.GetEncoding(936)))
            {
                int counter = 0;
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    counter++;
                    if (counter % 10000 == 0)
                    {
                        Console.WriteLine("Reached: " + counter.ToString());
                    }

                    // Columns: [0] timestamp, [1] presumably a user/session id (TODO confirm), [2] query text.
                    string[] row = line.Split('\t');
                    DateTime queryTime = DateTime.ParseExact(row[0], "yyyyMMddHHmmss", CultureInfo.InvariantCulture);
                    string lookupKey = row[1] + row[2];

                    // Add returns false when the key is already present, i.e. the query is still
                    // in the journal. Recording the key here is what makes the duplicate check
                    // effective; previously nothing was ever added to the set.
                    if (lookupPastQueries.Add(lookupKey))
                    {
                        // Invariant culture keeps the output in the Gregorian calendar regardless
                        // of the machine's regional settings.
                        fsOutput.WriteLine(row[2].Replace("[", "").Replace("]", "") + '\t' + queryTime.ToString("yyyy-MM-dd HH:mm:ss", CultureInfo.InvariantCulture));

                        // Add to journal, using the lookup key as the entry's query so that it
                        // can be removed from the set when the entry expires.
                        JournalEntry je = new JournalEntry();
                        je.EntryDateTime = queryTime;
                        je.Query = lookupKey;
                        journal.AddEntry(je);
                    }

                    // Expire journal entries older than 30 minutes and forget their keys.
                    foreach (JournalEntry expired in journal.GetEntriesBeforeDateAndDelete(queryTime.AddMinutes(-30)))
                    {
                        lookupPastQueries.Remove(expired.Query);
                    }
                }
            }
        }
    }
}
/// <summary>
/// Appends an entry to the end of the journal, making it the newest entry.
/// </summary>
/// <param name="entry">The entry to append.</param>
public void AddEntry(JournalEntry entry)
{
    bool journalIsEmpty = _oldestEntry == null;

    if (journalIsEmpty)
    {
        // First entry: it is the oldest as well as the newest.
        _oldestEntry = entry;
    }
    else
    {
        // Link the current tail to the new entry.
        _newestEntry.NextEntry = entry;
    }

    _newestEntry = entry;
    _entryCount++;
}
/// <summary>
/// Removes every entry at the old end of the journal whose timestamp is strictly earlier
/// than the supplied cut-off and returns the removed entries, oldest first. The removal
/// happens immediately when the method is called, not while the result is enumerated.
/// (THIS METHOD IS *NOT* THREAD-SAFE).
/// </summary>
/// <param name="beforeDateTime">Cut-off; entries with an earlier <c>EntryDateTime</c> are removed.</param>
/// <returns>The removed entries; empty when nothing was older than the cut-off.</returns>
public IEnumerable<JournalEntry> GetEntriesBeforeDateAndDelete(DateTime beforeDateTime)
{
    List<JournalEntry> expired = new List<JournalEntry>();

    // Walk from the oldest entry and stop at the first one that is not older than the cut-off.
    for (JournalEntry head = _oldestEntry;
         head != null && head.EntryDateTime < beforeDateTime;
         head = _oldestEntry)
    {
        expired.Add(head);

        // Unlink the head by advancing the oldest pointer to its successor.
        _oldestEntry = head.NextEntry;
        _entryCount--;
    }

    return expired;
}
/// <summary>
/// Extracts the typed queries from the Sogou 2008 query logs (all "*.filter" files in the
/// input directory) and writes them, one per line, as "query&lt;TAB&gt;yyyy-MM-dd HH:mm:ss".
/// Records whose query text starts with "http" or "www." are ignored entirely. A record is
/// not written when the same lookup key (columns 1 and 2 concatenated) still has at least one
/// journal entry from the preceding 30 minutes; every non-ignored record is journalled, so
/// repeated occurrences keep extending that window.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void MainSogou2008(string[] args)
{
    // Number of live journal entries per lookup key.
    Dictionary<string, int> lookupPastQueries = new Dictionary<string, int>();
    Journal journal = new Journal();

    // Directory.GetFiles does not guarantee any ordering, but the journal expiry below relies
    // on records arriving in chronological order. Sorting by path assumes the file names share
    // a common prefix in front of the date token -- TODO confirm against the data set.
    string[] files = Directory.GetFiles(@"D:\QueryLogs\SogouQ.2008\", "*.filter");
    Array.Sort(files, StringComparer.Ordinal);

    // "using" guarantees the output is flushed and closed even if a malformed record throws.
    using (StreamWriter fsOutput = new StreamWriter(new FileStream(@"D:\QueryLogs\SogouQ.2008\sogououtput.txt", FileMode.Create), Encoding.UTF8))
    {
        foreach (string file in files)
        {
            // The second dot-separated token of the file name carries the date as yyyyMMdd.
            string[] fileParts = new FileInfo(file).Name.Split('.');
            string datePart = fileParts[1].Substring(0, 4) + "-" + fileParts[1].Substring(4, 2) + "-" + fileParts[1].Substring(6, 2);

            using (StreamReader sr = new StreamReader(file, Encoding.GetEncoding(936)))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // Columns: [0] time of day, [1] presumably a user/session id (TODO confirm), [2] query text.
                    string[] row = line.Split('\t');

                    // Invariant culture so parsing does not depend on the machine's regional settings.
                    DateTime queryTime = DateTime.Parse(datePart + ' ' + row[0], System.Globalization.CultureInfo.InvariantCulture);

                    // Skip URL-like queries; ordinal comparison because this is a literal prefix test.
                    string queryWithoutOpeningBracket = row[2].Replace("[", "");
                    if (queryWithoutOpeningBracket.StartsWith("http", StringComparison.Ordinal)
                        || queryWithoutOpeningBracket.StartsWith("www.", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    string lookupKey = row[1] + row[2];

                    // pastCount stays 0 when the key has no live journal entries.
                    int pastCount = 0;
                    lookupPastQueries.TryGetValue(lookupKey, out pastCount);

                    // Only emit the query if it does not already exist in the journal.
                    if (pastCount == 0)
                    {
                        fsOutput.WriteLine(row[2].Replace("[", "").Replace("]", "") + '\t' + queryTime.ToString("yyyy-MM-dd HH:mm:ss", System.Globalization.CultureInfo.InvariantCulture));
                    }

                    // Journal every occurrence (emitted or not), keyed by the lookup key.
                    JournalEntry je = new JournalEntry();
                    je.EntryDateTime = queryTime;
                    je.Query = lookupKey;
                    journal.AddEntry(je);
                    lookupPastQueries[lookupKey] = pastCount + 1;

                    // Expire journal entries older than 30 minutes and release their counts.
                    foreach (JournalEntry expired in journal.GetEntriesBeforeDateAndDelete(queryTime.AddMinutes(-30)))
                    {
                        int remaining;
                        if (lookupPastQueries.TryGetValue(expired.Query, out remaining))
                        {
                            if (remaining <= 1)
                            {
                                // Drop exhausted keys so the dictionary does not grow with every
                                // distinct query ever seen; a missing key reads back as 0 above.
                                lookupPastQueries.Remove(expired.Query);
                            }
                            else
                            {
                                lookupPastQueries[expired.Query] = remaining - 1;
                            }
                        }
                    }
                }
            }
        }
    }
}