/// <summary>
/// Writes a report about one indexable to the console: its URI (and parent
/// URI, if any), the filter chosen for it, its MIME type, the URIs of any
/// filter-generated child indexables, its properties in sorted order, and
/// finally its extracted text together with the time the extraction took.
/// </summary>
/// <param name="indexable">
/// The indexable to filter and print. It is cleaned up before this method
/// returns normally, except on the early <c>NoContent</c> return.
/// </param>
static void Display (Indexable indexable)
{
	// Separate consecutive reports with a horizontal rule.
	if (!first_indexable) {
		Console.WriteLine ();
		Console.WriteLine ("-----------------------------------------");
		Console.WriteLine ();
	}
	first_indexable = false;

	Console.WriteLine ("Filename: " + indexable.Uri);
	if (indexable.ParentUri != null)
		Console.WriteLine ("Parent: " + indexable.ParentUri);

	Stopwatch watch = new Stopwatch ();
	Filter filter;

	// Time how long it takes to pick a filter for this indexable.
	watch.Start ();
	if (! FilterFactory.FilterIndexable (indexable, out filter)) {
		// No filter could be applied: release resources right away and
		// mark the indexable so that text extraction is skipped below.
		indexable.Cleanup ();
		indexable.NoContent = true;
		filter = null;
	}
	watch.Stop ();

	Console.WriteLine ("Filter: {0} (determined in {1})", filter, watch);
	Console.WriteLine ("MimeType: {0}", indexable.MimeType);
	Console.WriteLine ();

	// Child indexables produced by the filter; only retained (and later
	// displayed recursively) when show_generated is set.
	ArrayList generated_indexables = new ArrayList ();
	Indexable generated_indexable;

	bool first = true;
	if (filter != null && filter.HasGeneratedIndexable) {
		while (filter.GenerateNextIndexable (out generated_indexable)) {
			if (generated_indexable == null)
				continue;

			if (first) {
				Console.WriteLine ("Filter-generated indexables:");
				first = false;
			}

			Console.WriteLine (" {0}", generated_indexable.Uri);

			if (show_generated)
				generated_indexables.Add (generated_indexable);
			else
				generated_indexable.Cleanup ();
		}
	}

	if (! first)
		Console.WriteLine ();

	// Make sure that the properties are sorted.
	ArrayList prop_array = new ArrayList (indexable.Properties);
	prop_array.Sort ();

	Console.WriteLine ("Properties:");

	if (indexable.ValidTimestamp)
		Console.WriteLine (" Timestamp = {0}", DateTimeUtil.ToString (indexable.Timestamp));

	foreach (Beagrep.Property prop in prop_array) {
		// Properties without a value are not worth printing.
		if (String.IsNullOrEmpty (prop.Value))
			continue;

		Console.WriteLine (" {0} = {1}", prop.Key, prop.Value);
	}

	Console.WriteLine ();

	if (indexable.NoContent)
		return;

	// Re-use the stopwatch to time the text extraction.
	watch.Reset ();
	watch.Start ();

	TextReader reader;
	Analyzer indexing_analyzer = new BeagrepAnalyzer ();

	// Only referenced by the disabled (#if false / commented-out) code paths.
	char[] buffer = new char [2048];
	reader = indexable.GetTextReader ();
	char separater_char = (tokenize ? '\n' : ' ');

	if (reader != null) {
		first = true;

		// The reader (and the token stream wrapped around it) is closed in
		// a finally block so that it is released even when tokenizing,
		// reading or printing throws.
		try {
			if (analyze) {
				if (! stats_only)
					Console.WriteLine ("Content:");

				TokenStream token_stream = indexing_analyzer.TokenStream ("Text", reader);
				try {
					Lucene.Net.Analysis.Token token = token_stream.Next ();
					// No first token means there was no content at all.
					first = (token == null);

					if (! stats_only)
						for (; token != null; token = token_stream.Next ())
							Console.Write ("{0}{1}", token.TermText (), separater_char);
				} finally {
					token_stream.Close ();
				}
			} else {
#if false
				while (true) {
					int l = reader.Read (buffer, 0, 2048);
					if (l <= 0)
						break;
					if (first)
						first = false;
					if (! stats_only)
						DisplayContent (buffer, l);
				}
#else
				string line;
				first = true;
				while ((line = reader.ReadLine ()) != null) {
					if (first) {
						Console.WriteLine ("Content:");
						first = false;
					}
					if (! stats_only)
						DisplayContent (line);
				}
#endif
			}
		} finally {
			reader.Close ();
		}

		if (first)
			Console.WriteLine ("(no content)");
		else
			Console.WriteLine ('\n');
	}

	/*
	reader = indexable.GetHotTextReader ();
	first = true;
	if (reader != null) {
		Console.WriteLine ("HotContent:");

		if (analyze) {
			TokenStream token_stream = indexing_analyzer.TokenStream ("HotText", reader);
			Lucene.Net.Analysis.Token token = token_stream.Next ();
			first = (token == null);

			for (; token != null; token = token_stream.Next ())
				Console.Write ("{0}{1}", token.TermText (), separater_char);

			token_stream.Close ();
		} else {
			while (true) {
				int l = reader.Read (buffer, 0, 2048);
				if (l <= 0)
					break;
				if (first)
					first = false;
				DisplayContent (buffer, l);
			}
		}

		reader.Close ();

		if (first)
			Console.WriteLine ("(no hot content)");
		else
			Console.WriteLine ('\n');
	}
	*/

	watch.Stop ();

	Console.WriteLine ();
	Console.WriteLine ("Text extracted in {0}", watch);

	// Recurse into the retained filter-generated children (empty unless
	// show_generated was set).
	foreach (Indexable gi in generated_indexables)
		Display (gi);

	Stream stream = indexable.GetBinaryStream ();
	if (stream != null)
		stream.Close ();

	// Clean up any temporary files associated with filtering this indexable.
	indexable.Cleanup ();
}
/// <summary>
/// Writes a report about one indexable to the console: its URI (and parent
/// URI, if any), the filter chosen for it, its MIME type, the URIs of any
/// filter-generated child indexables, its properties in sorted order, and
/// finally its extracted text together with the time the extraction took.
/// </summary>
/// <param name="indexable">
/// The indexable to filter and print. It is cleaned up before this method
/// returns normally, except on the early <c>NoContent</c> return.
/// </param>
static void Display(Indexable indexable)
{
    // Separate consecutive reports with a horizontal rule.
    if (!first_indexable)
    {
        Console.WriteLine();
        Console.WriteLine("-----------------------------------------");
        Console.WriteLine();
    }
    first_indexable = false;

    Console.WriteLine("Filename: " + indexable.Uri);
    if (indexable.ParentUri != null)
    {
        Console.WriteLine("Parent: " + indexable.ParentUri);
    }

    Stopwatch watch = new Stopwatch();
    Filter filter;

    // Time how long it takes to pick a filter for this indexable.
    watch.Start();
    if (!FilterFactory.FilterIndexable(indexable, out filter))
    {
        // No filter could be applied: release resources right away and
        // mark the indexable so that text extraction is skipped below.
        indexable.Cleanup();
        indexable.NoContent = true;
        filter = null;
    }
    watch.Stop();

    Console.WriteLine("Filter: {0} (determined in {1})", filter, watch);
    Console.WriteLine("MimeType: {0}", indexable.MimeType);
    Console.WriteLine();

    // Child indexables produced by the filter; only retained (and later
    // displayed recursively) when show_generated is set.
    ArrayList generated_indexables = new ArrayList();
    Indexable generated_indexable;

    bool first = true;
    if (filter != null && filter.HasGeneratedIndexable)
    {
        while (filter.GenerateNextIndexable(out generated_indexable))
        {
            if (generated_indexable == null)
            {
                continue;
            }

            if (first)
            {
                Console.WriteLine("Filter-generated indexables:");
                first = false;
            }

            Console.WriteLine(" {0}", generated_indexable.Uri);

            if (show_generated)
            {
                generated_indexables.Add(generated_indexable);
            }
            else
            {
                generated_indexable.Cleanup();
            }
        }
    }

    if (!first)
    {
        Console.WriteLine();
    }

    // Make sure that the properties are sorted.
    ArrayList prop_array = new ArrayList(indexable.Properties);
    prop_array.Sort();

    Console.WriteLine("Properties:");

    if (indexable.ValidTimestamp)
    {
        Console.WriteLine(" Timestamp = {0}", DateTimeUtil.ToString(indexable.Timestamp));
    }

    foreach (Beagrep.Property prop in prop_array)
    {
        // Properties without a value are not worth printing.
        if (String.IsNullOrEmpty(prop.Value))
        {
            continue;
        }

        Console.WriteLine(" {0} = {1}", prop.Key, prop.Value);
    }

    Console.WriteLine();

    if (indexable.NoContent)
    {
        return;
    }

    // Re-use the stopwatch to time the text extraction.
    watch.Reset();
    watch.Start();

    TextReader reader;
    Analyzer indexing_analyzer = new BeagrepAnalyzer();

    // Only referenced by the disabled (#if false / commented-out) code paths.
    char[] buffer = new char [2048];
    reader = indexable.GetTextReader();
    char separater_char = (tokenize ? '\n' : ' ');

    if (reader != null)
    {
        first = true;

        // The reader (and the token stream wrapped around it) is closed in
        // a finally block so that it is released even when tokenizing,
        // reading or printing throws.
        try
        {
            if (analyze)
            {
                if (!stats_only)
                {
                    Console.WriteLine("Content:");
                }

                TokenStream token_stream = indexing_analyzer.TokenStream("Text", reader);
                try
                {
                    Lucene.Net.Analysis.Token token = token_stream.Next();
                    // No first token means there was no content at all.
                    first = (token == null);

                    if (!stats_only)
                    {
                        for (; token != null; token = token_stream.Next())
                        {
                            Console.Write("{0}{1}", token.TermText(), separater_char);
                        }
                    }
                }
                finally
                {
                    token_stream.Close();
                }
            }
            else
            {
#if false
                while (true)
                {
                    int l = reader.Read(buffer, 0, 2048);
                    if (l <= 0)
                    {
                        break;
                    }
                    if (first)
                    {
                        first = false;
                    }
                    if (!stats_only)
                    {
                        DisplayContent(buffer, l);
                    }
                }
#else
                string line;
                first = true;
                while ((line = reader.ReadLine()) != null)
                {
                    if (first)
                    {
                        Console.WriteLine("Content:");
                        first = false;
                    }
                    if (!stats_only)
                    {
                        DisplayContent(line);
                    }
                }
#endif
            }
        }
        finally
        {
            reader.Close();
        }

        if (first)
        {
            Console.WriteLine("(no content)");
        }
        else
        {
            Console.WriteLine('\n');
        }
    }

    /*
     * reader = indexable.GetHotTextReader ();
     * first = true;
     * if (reader != null) {
     *      Console.WriteLine ("HotContent:");
     *
     *      if (analyze) {
     *              TokenStream token_stream = indexing_analyzer.TokenStream ("HotText", reader);
     *              Lucene.Net.Analysis.Token token = token_stream.Next ();
     *              first = (token == null);
     *
     *              for (; token != null; token = token_stream.Next ())
     *                      Console.Write ("{0}{1}", token.TermText (), separater_char);
     *
     *              token_stream.Close ();
     *      } else {
     *              while (true) {
     *                      int l = reader.Read (buffer, 0, 2048);
     *                      if (l <= 0)
     *                              break;
     *                      if (first)
     *                              first = false;
     *                      DisplayContent (buffer, l);
     *              }
     *      }
     *
     *      reader.Close ();
     *
     *      if (first)
     *              Console.WriteLine ("(no hot content)");
     *      else
     *              Console.WriteLine ('\n');
     * }
     */

    watch.Stop();

    Console.WriteLine();
    Console.WriteLine("Text extracted in {0}", watch);

    // Recurse into the retained filter-generated children (empty unless
    // show_generated was set).
    foreach (Indexable gi in generated_indexables)
    {
        Display(gi);
    }

    Stream stream = indexable.GetBinaryStream();
    if (stream != null)
    {
        stream.Close();
    }

    // Clean up any temporary files associated with filtering this indexable.
    indexable.Cleanup();
}