/// <summary>
/// Applies the benchmark configuration to this content source.
/// <para>
/// Reads <c>work.dir</c> (default <c>"work"</c>) and <c>docs.dir</c> (default <c>"trec"</c>),
/// combines them into the data directory, collects the input files from it, instantiates
/// the TREC document parser (<c>trec.doc.parser</c>) and the HTML parser (<c>html.parser</c>)
/// by type name, falls back to the <c>iso-8859-1</c> encoding when none has been set, and
/// reads the <c>content.source.excludeIteration</c> flag (default <c>false</c>).
/// </para>
/// </summary>
/// <param name="config">The configuration to read settings from.</param>
/// <exception cref="ArgumentException">
/// Thrown when no input files were collected from the data directory.
/// </exception>
/// <remarks>
/// A failure to instantiate either parser is rethrown as the exception produced by
/// <c>RuntimeException.Create</c>.
/// </remarks>
public override void SetConfig(Config config)
{
    base.SetConfig(config);

    // dirs: "docs.dir" is combined with the full path of "work.dir"
    DirectoryInfo workDir = new DirectoryInfo(config.Get("work.dir", "work"));
    string d = config.Get("docs.dir", "trec");
    dataDir = new DirectoryInfo(Path.Combine(workDir.FullName, d));

    // files
    CollectFiles(dataDir, inputFiles);
    if (inputFiles.Count == 0)
    {
        throw new ArgumentException("No files in dataDir: " + dataDir);
    }

    // trec doc parser
    try
    {
        string trecDocParserClassName = config.Get("trec.doc.parser",
            "Lucene.Net.Benchmarks.ByTask.Feeds.TrecGov2Parser, Lucene.Net.Benchmark");
        trecDocParser = (TrecDocParser)Activator.CreateInstance(Type.GetType(trecDocParserClassName));
    }
    catch (Exception e) when (e.IsException())
    {
        // Should not get here. Throw runtime exception.
        throw RuntimeException.Create(e);
    }

    // html parser
    try
    {
        string htmlParserClassName = config.Get("html.parser",
            "Lucene.Net.Benchmarks.ByTask.Feeds.DemoHTMLParser, Lucene.Net.Benchmark");
        htmlParser = (IHTMLParser)Activator.CreateInstance(Type.GetType(htmlParserClassName));
    }
    // Use the same filter as the TREC parser block above, so both parsers wrap exactly
    // the same set of failures.
    // NOTE(review): IsException() presumably excludes fatal/error-class exceptions - confirm.
    catch (Exception e) when (e.IsException())
    {
        // Should not get here. Throw runtime exception.
        throw RuntimeException.Create(e);
    }

    // encoding: only default it when nothing has been set already
    // (equivalent of Java's StandardCharsets.ISO_8859_1)
    if (m_encoding == null)
    {
        m_encoding = Encoding.GetEncoding("iso-8859-1");
    }

    // iteration exclusion in doc name
    excludeDocnameIteration = config.Get("content.source.excludeIteration", false);
}
/// <summary>
/// Calls <c>DoClose()</c> and then opens the next input file, assigning <c>reader</c>
/// and <c>currPathType</c> for it.
/// <para>
/// When the file list is exhausted, iteration restarts from the first file and
/// <c>iteration</c> is incremented, unless <c>m_forever</c> is <c>false</c>.
/// A file that fails to open is skipped (with a console message) in verbose mode;
/// otherwise the failure ends the data.
/// </para>
/// </summary>
/// <exception cref="NoMoreDataException">
/// Thrown when all files have been consumed and <c>m_forever</c> is <c>false</c>, or when
/// a file cannot be opened and <c>m_verbose</c> is <c>false</c>.
/// </exception>
internal virtual void OpenNextFile()
{
    DoClose();

    while (true)
    {
        if (nextFile >= inputFiles.Count)
        {
            // exhausted files, start a new round, unless forever set to false.
            if (!m_forever)
            {
                throw new NoMoreDataException();
            }
            nextFile = 0;
            iteration++;
        }

        FileInfo f = inputFiles[nextFile++];
        if (m_verbose)
        {
            Console.WriteLine("opening: " + f + " length: " + f.Length);
        }

        Stream inputStream = null;
        try
        {
            // support either gzip, bzip2, or regular text file, by extension
            inputStream = StreamUtils.GetInputStream(f);

            // Resolve the path type before publishing the reader, so that a failure here
            // cannot leave 'reader' pointing at a file that is being rejected.
            // NOTE(review): assumes PathType(f) depends only on the file - confirm.
            var pathType = TrecDocParser.PathType(f);

            reader = new StreamReader(inputStream, m_encoding);
            currPathType = pathType;
            return;
        }
        catch (Exception e) when (e.IsException())
        {
            // The stream may already be open when a later step fails; release it so the
            // underlying file handle is not leaked for every skipped file.
            inputStream?.Dispose();

            if (m_verbose)
            {
                Console.WriteLine("Skipping 'bad' file " + f.FullName + " due to " + e.Message);
                continue;
            }
            throw new NoMoreDataException();
        }
    }
}