/// <summary>
/// Appends an entry to the newest end of the journal and increments the entry count.
/// </summary>
/// <param name="entry">The entry to append; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="entry"/> is null.</exception>
public void AddEntry(JournalEntry entry)
{
    // A null entry would be counted without being stored and would leave
    // _newestEntry null, making the following append fail on _newestEntry.NextEntry.
    if (entry == null)
    {
        throw new ArgumentNullException("entry");
    }

    if (_oldestEntry == null)
    {
        // No existing entries: the new entry becomes the oldest as well as the newest.
        _oldestEntry = entry;
    }
    else
    {
        // Link the current newest entry to the one being appended.
        _newestEntry.NextEntry = entry;
    }

    _newestEntry = entry;
    _entryCount++;
}
/// <summary>
/// Detaches, starting from the oldest entry, every leading entry whose
/// <c>EntryDateTime</c> is earlier than <paramref name="beforeDateTime"/> and
/// returns the detached entries in oldest-first order. The walk stops at the
/// first entry that is not earlier than the cut-off.
/// (THIS METHOD IS *NOT* THREAD-SAFE).
/// </summary>
/// <param name="beforeDateTime">Cut-off; entries strictly earlier than this are removed.</param>
/// <returns>The removed entries; empty when nothing qualifies.</returns>
public IEnumerable<JournalEntry> GetEntriesBeforeDateAndDelete(DateTime beforeDateTime)
{
    List<JournalEntry> removed = new List<JournalEntry>();

    while (true)
    {
        JournalEntry head = _oldestEntry;

        // Stop once the journal is empty or the head is no longer older than the cut-off.
        if (head == null || !(head.EntryDateTime < beforeDateTime))
        {
            break;
        }

        // Collect the head, then advance the journal past it.
        removed.Add(head);
        _oldestEntry = head.NextEntry;
        _entryCount--;
    }

    return removed;
}
/// <summary>
/// Reads tab-separated query-log files (column 0: user id, column 1: query time,
/// column 2: query text) and writes "query&lt;TAB&gt;time" to the output file for
/// every query the same user has not issued within the preceding 30 minutes.
/// Header rows, URL-like queries ("http…", "www.…") and the "-" placeholder are
/// not written.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Number of journal entries currently inside the window, per (user, query) key.
    Dictionary<string, int> lookupPastQueries = new Dictionary<string, int>();
    Journal journal = new Journal();
    string[] files = new string[] { @"C:\hadoop-cdh4.0\aol-filtered.txt" };

    // The writer spans all input files; the using block closes it exactly once,
    // after the last file, and also when an exception escapes.
    using (StreamWriter fsOutput = new StreamWriter(new FileStream(@"D:\aol-queries-new.txt", FileMode.Create), Encoding.UTF8))
    {
        foreach (string file in files)
        {
            using (StreamReader sr = new StreamReader(file))
            {
                int counter = 0;
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // Progress output counts every line read, including skipped ones.
                    counter++;
                    if (counter % 10000 == 0)
                    {
                        Console.WriteLine("Reached: " + counter.ToString());
                    }

                    string[] row = line.Split('\t');
                    string anonId = row[0];
                    if (anonId == "AnonID")
                    {
                        // Header row.
                        continue;
                    }

                    // Invariant lower-casing keeps keys and output independent of the machine's culture.
                    string query = row[2].ToLowerInvariant();
                    if (query.StartsWith("http", StringComparison.Ordinal) || query.StartsWith("www.", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    string queryTime = row[1];
                    DateTime queryTimeDT = DateTime.Parse(queryTime, System.Globalization.CultureInfo.InvariantCulture);

                    // The tab separator cannot occur inside a tab-split field, so
                    // distinct (user, query) pairs can never produce the same key.
                    string lookupKey = anonId + '\t' + query;

                    // A missing key leaves pastCount at 0.
                    int pastCount;
                    lookupPastQueries.TryGetValue(lookupKey, out pastCount);

                    // Emit only when this user has no occurrence of the query inside the window.
                    if (pastCount == 0 && query != "-")
                    {
                        fsOutput.WriteLine(query + '\t' + queryTime);
                    }

                    // Record this occurrence in the journal under the lookup key.
                    JournalEntry je = new JournalEntry();
                    je.EntryDateTime = queryTimeDT;
                    je.Query = lookupKey;
                    journal.AddEntry(je);

                    lookupPastQueries[lookupKey] = pastCount + 1;

                    // Expire journal entries older than 30 minutes before the current query
                    // and release their counts.
                    foreach (JournalEntry expired in journal.GetEntriesBeforeDateAndDelete(queryTimeDT.AddMinutes(-30)))
                    {
                        int remaining;
                        if (lookupPastQueries.TryGetValue(expired.Query, out remaining) && remaining > 1)
                        {
                            lookupPastQueries[expired.Query] = remaining - 1;
                        }
                        else
                        {
                            // Drop keys that reach zero so the dictionary only holds
                            // keys that still have entries in the journal.
                            lookupPastQueries.Remove(expired.Query);
                        }
                    }
                }
            }
        }
    }
}