Пример #1
0
        /// <summary>
        /// Console entry point: crawls the extracted article texts and builds the
        /// complete index (token positions included), printing the start and finish times.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // These paths have to be adjusted to the local environment.
            string articlesDir = @"..\..\..\..\TestData\eksTRAKTOR_output\pagesTexts";
            string indexDir = @"..\..\..\..\TestData\index";

            Console.WriteLine("Started {0}", DateTime.Now);

            // Build the whole index, including token positions.
            // The pipeline pieces are created in the same order as a nested
            // constructor expression would evaluate them.
            IxScatteredStorageCrawler storageCrawler = new IxScatteredStorageCrawler(articlesDir);
            IxBufferedCrawler bufferedCrawler = new IxBufferedCrawler(storageCrawler);
            // NOTE(review): 12 is presumably the indexer thread count - confirm against IxStdRIIndexer.
            IxStdRIIndexer indexer = new IxStdRIIndexer(indexDir, 12);
            IxCreator indexCreator = new IxCreator(bufferedCrawler, indexer, indexDir);
            indexCreator.DoIndexing();

            Console.WriteLine("Finished {0}", DateTime.Now);

            /*
            // Create the files with token positions (FOR DEBUGGING PURPOSES ONLY)
            IxStdDiskFwdIndex fwdIndex = new IxStdDiskFwdIndex(pathToIndex + @"\fwdIndex", IxIndexMode.READ);
            IxTokenPositionsIndex index = new IxTokenPositionsIndex(pathToIndex);
            index.Create(fwdIndex);
             */
        }
Пример #2
0
        /// <summary>
        /// Starts the extraction of the contents of pages found in the XML document into text files.
        /// Validates the output directory, prepares the cleaner and the optional
        /// distributor / collector / indexer components according to <c>eTRSettings</c>,
        /// allocates the worker bookkeeping structures and launches the advisor worker.
        /// </summary>
        ///
        /// <param name="inputXMLPath">Path to the XML document with the wiki articles dump (pages-articles.xml)</param>
        /// <param name="outputDirPath">Path to an existing, empty directory into which the article texts are to be extracted</param>
        /// <exception cref="eTRAlreadyWorkingException">The advisor worker is already busy.</exception>
        /// <exception cref="eTROutputDirDontExistsException">The output directory does not exist.</exception>
        /// <exception cref="eTROutputDirNotEmptyException">The output directory contains files or subdirectories.</exception>
        public void run(String inputXMLPath, String outputDirPath)
        {
            if (advisorThread.IsBusy)
                throw new eTRAlreadyWorkingException();

            DirectoryInfo di = new DirectoryInfo(outputDirPath);
            if (!di.Exists)
                throw new eTROutputDirDontExistsException();

            if (di.GetFiles().Length != 0 || di.GetDirectories().Length != 0)
                throw new eTROutputDirNotEmptyException();

            this.inputXMLPath = inputXMLPath;

            // Strings are immutable: TrimEnd returns a new string, so the result must be
            // assigned back. Otherwise a trailing backslash survives and the paths built
            // below end up with a doubled separator.
            outputDirPath = outputDirPath.TrimEnd('\\');

            extractedPages = 0;

            cleaner = new eTRCleaner(messageCollector);

            // Per-page text files are only needed when one of the "save as files" options is on.
            if (eTRSettings.saveAsFiles || eTRSettings.saveAsFilesOriginalPageText)
                distributor = new eTRDistributor(outputDirPath + "\\pagesTexts", messageCollector);

            if (eTRSettings.createCollectiveFile)
                collector = new eTRCollector(outputDirPath + "\\collectedArticles.dat");

            if (eTRSettings.createIndyx)
            {
                // Indexing runs on its own thread, started here before the advisor worker.
                indyxDocs = new IxManualServedCrawler();
                indyxIndexer = new IxStdRIIndexer(outputDirPath + "\\index", eTRSettings.indyxThreadCount);
                indyxCreator = new IxCreator(indyxDocs, indyxIndexer, outputDirPath + "\\index");
                indyxThread = new Thread(new ThreadStart(indyxCreator.DoIndexing));
                indyxThread.Start();
            }

            ThreadPool.SetMaxThreads(eTRSettings.createdWorkerThreadsCount, eTRSettings.completionPortThreadsCount);

            pageNodesBuffer = new HashSet<Match>();
            inputXMLBuffer = new StringBuilder(eTRSettings.bufferStartCapacity);

            // One slot per worker; only the lock objects are populated here, the
            // works / workFinishedEvents entries are left unset by this method.
            works = new Match[eTRSettings.createdWorkerThreadsCount];
            workFinishedEvents = new ManualResetEvent[eTRSettings.createdWorkerThreadsCount];
            worksLocks = new Object[eTRSettings.createdWorkerThreadsCount];
            for (int i = 0; i < eTRSettings.createdWorkerThreadsCount; i++)
                worksLocks[i] = new Object();

            advisorThread.RunWorkerAsync();
        }