/// <summary>
/// Trains a model from the events in the specified event file and writes it out.
/// The model file name is the event file name with ".bin.gz" appended.
/// </summary>
/// <param name="args">
/// eventfile [iterations [cutoff]]. When omitted, iterations defaults to 100
/// and cutoff defaults to 5.
/// </param>
/// <exception cref="IOException"> when the eventfile can not be read or the model file can not be written. </exception>
public static void Main(string[] args)
{
    if (args.Length == 0)
    {
        Console.Error.WriteLine("Usage: FileEventStream eventfile [iterations cutoff]");
        Environment.Exit(1);
    }

    int ai = 0;
    string eventFile = args[ai++];
    int iterations = 100;
    int cutoff = 5;

    // Each optional argument is read only when it is actually present.
    // Previously, passing iterations without cutoff indexed past the end of
    // args and crashed with IndexOutOfRangeException; now the default cutoff
    // is kept in that case.
    if (ai < args.Length)
    {
        iterations = Convert.ToInt32(args[ai++]);
    }
    if (ai < args.Length)
    {
        cutoff = Convert.ToInt32(args[ai++]);
    }

    AbstractModel model;
    FileEventStream es = new FileEventStream(eventFile);
    try
    {
        model = GIS.trainModel(es, iterations, cutoff);
    }
    finally
    {
        // Always release the event stream, even if training throws.
        es.close();
    }

    (new SuffixSensitiveGISModelWriter(model, new Jfile(eventFile + ".bin.gz"))).persist();
}
/// <summary>
/// Reads every event from <c>eventStream</c>, writes each one as a line to
/// <c>eventStore</c> for later processing, and passes its context predicates
/// to <c>update</c> together with the running counter and <c>cutoff</c>.
/// Afterwards every predicate collected in the predicate set is assigned a
/// unique integer index in <c>predicatesInOut</c>, and its count is stored at
/// that index in <c>predCounts</c>.
/// NOTE(review): <c>update</c> is presumably what restricts the predicate set
/// to predicates seen at least <c>cutoff</c> times; confirm against its definition.
/// </summary>
/// <param name="eventStream"> the stream of events to read. </param>
/// <param name="eventStore"> a writer to which the events are written for later processing; it is closed by this method. </param>
/// <param name="predicatesInOut"> map that receives the predicate-to-index assignments. </param>
/// <param name="cutoff"> the cutoff value forwarded to <c>update</c>. </param>
/// <returns> the number of events read from <c>eventStream</c>. </returns>
private int computeEventCounts(EventStream eventStream, Writer eventStore, IDictionary<string, int?> predicatesInOut, int cutoff)
{
    try
    {
        IDictionary<string, int?> counter = new Dictionary<string, int?>();
        HashSet<string> predicateSet = new HashSet<string>();
        int eventCount = 0;

        while (eventStream.hasNext())
        {
            Event ev = eventStream.next();
            eventCount++;
            eventStore.write(FileEventStream.toLine(ev));
            string[] ec = ev.Context;
            update(ec, predicateSet, counter, cutoff);
        }

        predCounts = new int[predicateSet.Count];
        int index = 0;
        foreach (string predicate in predicateSet)
        {
            predCounts[index] = counter[predicate].GetValueOrDefault();
            predicatesInOut[predicate] = index;
            index++;
        }

        return eventCount;
    }
    finally
    {
        // Close the store on every path: previously an exception while reading
        // or writing events left the writer (and its underlying file) open.
        eventStore.close();
    }
}
/// <summary>
/// Constructs a TwoPassDataIndexer from an event stream.
/// The first pass (<c>computeEventCounts</c>) writes the events to a temporary
/// file while filling the predicate index; the second pass reads that file
/// back through a <c>FileEventStream</c> and indexes the events. The indexed
/// events are then handed to <c>sortAndMerge</c>.
/// An <see cref="IOException"/> raised along the way is written to standard
/// error and not rethrown.
/// </summary>
/// <param name="eventStream"> The stream of training events to index. </param>
/// <param name="cutoff"> The cutoff passed to the event-counting pass. </param>
/// <param name="sort"> Forwarded to <c>sortAndMerge</c>; also selects which progress message is printed. </param>
public TwoPassDataIndexer(EventStream eventStream, int cutoff, bool sort)
{
    IDictionary<string, int?> predicateIndex = new Dictionary<string, int?>();

    Console.WriteLine("Indexing events using cutoff of " + cutoff + "\n");
    Console.Write("\tComputing event counts... ");

    try
    {
        // Pass 1: spill the events to a temp file and build the predicate index.
        Jfile spillFile = Jfile.createTempFile("events", null);
        spillFile.deleteOnExit();

        FileOutputStream rawOut = new FileOutputStream(spillFile);
        Writer spillWriter = new BufferedWriter(new OutputStreamWriter(rawOut, "UTF8"));
        int numEvents = computeEventCounts(eventStream, spillWriter, predicateIndex, cutoff);
        Console.WriteLine("done. " + numEvents + " events");

        // Pass 2: re-read the spilled events and index them.
        Console.Write("\tIndexing... ");
        List<ComparableEvent> eventsToCompare;
        FileEventStream spilledEvents = new FileEventStream(spillFile);
        try
        {
            eventsToCompare = index(numEvents, spilledEvents, predicateIndex);
        }
        finally
        {
            spilledEvents.close();
        }

        // The predicate index is no longer needed; drop the reference.
        predicateIndex = null;
        spillFile.delete();
        Console.WriteLine("done.");

        string phaseMessage = sort ? "Sorting and merging events... " : "Collecting events... ";
        Console.Write(phaseMessage);

        sortAndMerge(eventsToCompare, sort);
        Console.WriteLine("Done indexing.");
    }
    catch (IOException e)
    {
        Console.Error.WriteLine(e);
    }
}