Exemplo n.º 1
0
        /// <summary>
        /// Configures this content source from <paramref name="config"/>.
        /// <para/>
        /// Properties read here (with their defaults):
        /// <list type="bullet">
        ///     <item><description><c>work.dir</c> (default <c>work</c>) - base working directory.</description></item>
        ///     <item><description><c>docs.dir</c> (default <c>trec</c>) - data directory, combined with the full path of <c>work.dir</c>.</description></item>
        ///     <item><description><c>trec.doc.parser</c> - assembly-qualified type name of the <see cref="TrecDocParser"/> to instantiate.</description></item>
        ///     <item><description><c>html.parser</c> - assembly-qualified type name of the <see cref="IHTMLParser"/> to instantiate.</description></item>
        ///     <item><description><c>content.source.excludeIteration</c> (default <c>false</c>) - stored in <c>excludeDocnameIteration</c>.</description></item>
        /// </list>
        /// </summary>
        /// <param name="config">The configuration to read the properties from.</param>
        /// <exception cref="ArgumentException">No input files were collected from the data directory.</exception>
        public override void SetConfig(Config config)
        {
            base.SetConfig(config);

            // dirs
            DirectoryInfo workDir = new DirectoryInfo(config.Get("work.dir", "work"));
            string        d       = config.Get("docs.dir", "trec");

            dataDir = new DirectoryInfo(Path.Combine(workDir.FullName, d));

            // files
            CollectFiles(dataDir, inputFiles);
            if (inputFiles.Count == 0)
            {
                throw new ArgumentException("No files in dataDir: " + dataDir);
            }

            // trec doc parser
            try
            {
                string trecDocParserClassName = config.Get("trec.doc.parser", "Lucene.Net.Benchmarks.ByTask.Feeds.TrecGov2Parser, Lucene.Net.Benchmark");
                trecDocParser = (TrecDocParser)Activator.CreateInstance(Type.GetType(trecDocParserClassName));
            }
            catch (Exception e) when (e.IsException())
            {
                // Should not get here. Throw runtime exception.
                throw RuntimeException.Create(e);
            }

            // html parser
            try
            {
                string htmlParserClassName = config.Get("html.parser",
                                                        "Lucene.Net.Benchmarks.ByTask.Feeds.DemoHTMLParser, Lucene.Net.Benchmark");
                htmlParser = (IHTMLParser)Activator.CreateInstance(Type.GetType(htmlParserClassName));
            }
            // Same filter as the doc-parser handler above, so both parser
            // instantiation failures are wrapped under identical conditions.
            catch (Exception e) when (e.IsException())
            {
                // Should not get here. Throw runtime exception.
                throw RuntimeException.Create(e);
            }

            // encoding: only applied when no encoding has been set yet
            if (m_encoding == null)
            {
                m_encoding = Encoding.GetEncoding("iso-8859-1"); //StandardCharsets.ISO_8859_1.name();
            }

            // iteration exclusion in doc name
            excludeDocnameIteration = config.Get("content.source.excludeIteration", false);
        }
Exemplo n.º 2
0
 /// <summary>
 /// Closes the current input via <c>DoClose()</c> and opens the next entry of
 /// <c>inputFiles</c>, assigning <c>reader</c> and <c>currPathType</c> on success.
 /// <para/>
 /// When the end of the file list is reached, the list is restarted from the
 /// first file and <c>iteration</c> is incremented, unless <c>m_forever</c> is
 /// <c>false</c>.
 /// <para/>
 /// A file that fails to open is skipped only when <c>m_verbose</c> is set;
 /// otherwise the failure ends the data with <see cref="NoMoreDataException"/>.
 /// </summary>
 /// <exception cref="NoMoreDataException">
 /// All files were consumed and <c>m_forever</c> is <c>false</c>, or a file
 /// could not be opened while <c>m_verbose</c> is <c>false</c>.
 /// </exception>
 internal virtual void OpenNextFile()
 {
     DoClose();
     //currPathType = null;
     while (true)
     {
         if (nextFile >= inputFiles.Count)
         {
             // exhausted files, start a new round, unless forever set to false.
             if (!m_forever)
             {
                 throw new NoMoreDataException();
             }
             nextFile = 0;
             iteration++;
         }
         FileInfo f = inputFiles[nextFile++];
         if (m_verbose)
         {
             Console.WriteLine("opening: " + f + " length: " + f.Length);
         }

         Stream inputStream = null;
         bool opened = false;
         try
         {
             inputStream = StreamUtils.GetInputStream(f); // support either gzip, bzip2, or regular text file, by extension

             // Build everything in locals first so the fields are only updated
             // once the file has been opened completely.
             var nextReader   = new StreamReader(inputStream, m_encoding);
             var nextPathType = TrecDocParser.PathType(f);

             reader       = nextReader;
             currPathType = nextPathType;
             opened       = true;
             return;
         }
         catch (Exception e) when (e.IsException())
         {
             if (m_verbose)
             {
                 Console.WriteLine("Skipping 'bad' file " + f.FullName + " due to " + e.Message);
                 continue;
             }
             throw new NoMoreDataException();
         }
         finally
         {
             // Do not leak the underlying stream when opening failed part-way
             // (e.g. reader construction or path type detection threw).
             if (!opened)
             {
                 inputStream?.Dispose();
             }
         }
     }
 }