/// <summary>
/// Two argument constructor for TwoPassDataIndexer.
/// </summary>
/// <remarks>
/// Works in two passes: the first pass streams the events to a temporary
/// file while counting predicates, the second pass re-reads that file to
/// build the indexed events. The temporary file is removed even when one
/// of the passes throws.
/// </remarks>
/// <param name="eventReader">
/// An ITrainingEventReader which contains a list of all the events
/// seen in the training data.
/// </param>
/// <param name="cutoff">
/// The minimum number of times a predicate must have been
/// observed in order to be included in the model.
/// </param>
public TwoPassDataIndexer(ITrainingEventReader eventReader, int cutoff)
{
    List<ComparableEvent> eventsToCompare;
    var predicateIndex = new Dictionary<string, int>();

    // Path.GetTempFileName() creates the file on disk and already returns its full path.
    string tempFile = Path.GetTempFileName();
    try
    {
        // Pass 1: spool the events to the temp file and collect the predicates that meet the cutoff.
        int eventCount = ComputeEventCounts(eventReader, tempFile, predicateIndex, cutoff);

        // Pass 2: re-read the spooled events and index them against the retained predicates.
        using (var fileEventReader = new FileEventReader(tempFile))
        {
            eventsToCompare = Index(eventCount, fileEventReader, predicateIndex);
        }
    }
    finally
    {
        // Clean up on both the success and the failure path so a failed pass does not leak the temp file.
        if (File.Exists(tempFile))
        {
            File.Delete(tempFile);
        }
    }

    SortAndMerge(eventsToCompare);
}
/// <summary>
/// Reads every event from <paramref name="eventReader"/>, writes each one to
/// <paramref name="eventStoreFile"/> and counts the predicates in its context.
/// A predicate is added to <paramref name="predicatesInOut"/>, together with a
/// unique integer index, as soon as it has been seen <paramref name="cutoff"/> times.
/// </summary>
/// <param name="eventReader">
/// The reader supplying the training events.
/// </param>
/// <param name="eventStoreFile">
/// Name of the file the events are written to for later processing.
/// </param>
/// <param name="predicatesInOut">
/// Dictionary receiving the retained predicates mapped to their index. Predicates
/// already present in it are not counted again. Indices handed out by this
/// method start at 0 and follow the order in which predicates reach the cutoff.
/// </param>
/// <param name="cutoff">
/// The number of observations a predicate needs before it is retained.
/// </param>
/// <returns>
/// The number of events read from <paramref name="eventReader"/>.
/// </returns>
private int ComputeEventCounts(ITrainingEventReader eventReader, string eventStoreFile, Dictionary<string, int> predicatesInOut, int cutoff)
{
    // Running counts for predicates that have not reached the cutoff yet.
    var pendingCounts = new Dictionary<string, int>();
    int nextPredicateIndex = 0;
    int eventCount = 0;

    using (var eventStoreWriter = new StreamWriter(eventStoreFile))
    {
        while (eventReader.HasNext())
        {
            TrainingEvent currentTrainingEvent = eventReader.ReadNextEvent();
            eventCount++;
            eventStoreWriter.Write(FileEventReader.ToLine(currentTrainingEvent));

            foreach (string predicate in currentTrainingEvent.Context)
            {
                if (predicatesInOut.ContainsKey(predicate))
                {
                    // Already retained; nothing left to count.
                    continue;
                }

                // A single lookup: a predicate not seen before leaves 'seen' at 0.
                int seen;
                pendingCounts.TryGetValue(predicate, out seen);
                seen++;

                if (seen >= cutoff)
                {
                    predicatesInOut.Add(predicate, nextPredicateIndex++);
                    // The running count is no longer needed once the predicate is retained.
                    pendingCounts.Remove(predicate);
                }
                else
                {
                    pendingCounts[predicate] = seen;
                }
            }
        }
    }

    return eventCount;
}