Esempio n. 1
0
        /// <summary>
        /// Analyzes the first <code>nLines</code> of the ascii stream.
        /// </summary>
        /// <param name="importOptions">The import options. This can already contain known values. On return, this instance should be ready to be used to import ascii data, i.e. all fields should contain values unequal to <c>null</c>.</param>
        /// <param name="stream">The ascii stream to analyze.</param>
        /// <param name="analysisOptions">Options that specify how many lines are analyzed, and what number formats and date/time formats will be tested.</param>
        public void InternalAnalyze(AsciiImportOptions importOptions, System.IO.Stream stream, AsciiDocumentAnalysisOptions analysisOptions)
        {
            if (null == stream)
            {
                throw new ArgumentNullException("Stream");
            }
            if (null == analysisOptions)
            {
                throw new ArgumentNullException("analysisOptions");
            }
            if (null == importOptions)
            {
                throw new ArgumentNullException("importOptions");
            }

            // Read-in the lines into _bodyLines. If the number of header lines is already known, those header lines are read into _headerLines
            ReadLinesToAnalyze(stream, analysisOptions.NumberOfLinesToAnalyze, importOptions.NumberOfMainHeaderLines);

            if (_bodyLines.Count == 0)
            {
                return; // there is nothing to analyze
            }
            // Analyze the whitespace structure of the body lines, find out if there is a fixed column width
            _globalStructure = new AsciiGlobalStructureAnalysis(_bodyLines);

            // Sets all separation strategies to test for. If importOptions already contain a separation strategy, only this separation strategy is set
            SetLineAnalysisOptionsToTest(importOptions, analysisOptions);

            // Analyze each of the first few lines with all possible separation strategies
            _lineAnalysisOfBodyLines = new AsciiLineAnalysis[_bodyLines.Count];

            // Do the analysis itself in parallel for each of the lines
            System.Threading.Tasks.Parallel.For(0, _bodyLines.Count, (i) => _lineAnalysisOfBodyLines[i] = new AsciiLineAnalysis(i, _bodyLines[i], _lineAnalysisOptionsToTest));

            // for debugging activate the next line and paste the data into notepad:
            // PutRecognizedStructuresToClipboard(result, separationStrategies);

            EvaluateScoringOfAllLineAnalysisOptions();

            // Evaluate the best separation strategy. Store the value in _highestScoredSeparationStrategy and the corresponding line structure in _highestScoredLineStructure;
            EvaluateHighestScoredLineAnalysisOption();

            // look how many header lines are in the file by comparing the structure of the first lines  with the _highestScoredLineStructure
            if (null == importOptions.NumberOfMainHeaderLines)
            {
                EvaluateNumberOfMainHeaderLines();
            }
            else
            {
                _numberOfMainHeaderLines = importOptions.NumberOfMainHeaderLines.Value;
            }

            // get the index of the caption line
            if (null == importOptions.IndexOfCaptionLine)
            {
                EvaluateIndexOfCaptionLine();
            }
            else
            {
                _indexOfCaptionLine = importOptions.IndexOfCaptionLine.Value;
            }

            importOptions.NumberOfMainHeaderLines = _numberOfMainHeaderLines;
            importOptions.IndexOfCaptionLine      = _indexOfCaptionLine;

            importOptions.SeparationStrategy    = _highestScoredLineAnalysisOption.SeparationStrategy;
            importOptions.NumberFormatCulture   = _highestScoredLineAnalysisOption.NumberFormat;
            importOptions.DateTimeFormatCulture = _highestScoredLineAnalysisOption.DateTimeFormat;

            importOptions.RecognizedStructure = _lineAnalysisOptionsScoring[_highestScoredLineAnalysisOption].LineStructure;
        }
Esempio n. 2
0
		/// <summary>
		/// Analyzes the first <code>nLines</code> of the ascii stream.
		/// </summary>
		/// <param name="importOptions">The import options. This can already contain known values. On return, this instance should be ready to be used to import ascii data, i.e. all fields should contain values unequal to <c>null</c>.</param>
		/// <param name="stream">The ascii stream to analyze.</param>
		/// <param name="analysisOptions">Options that specify how many lines are analyzed, and what number formats and date/time formats will be tested.</param>
		public void InternalAnalyze(AsciiImportOptions importOptions, System.IO.Stream stream, AsciiDocumentAnalysisOptions analysisOptions)
		{
			if (null == stream)
				throw new ArgumentNullException("Stream");
			if (null == analysisOptions)
				throw new ArgumentNullException("analysisOptions");
			if (null == importOptions)
				throw new ArgumentNullException("importOptions");

			// Read-in the lines into _bodyLines. If the number of header lines is already known, those header lines are read into _headerLines
			ReadLinesToAnalyze(stream, analysisOptions.NumberOfLinesToAnalyze, importOptions.NumberOfMainHeaderLines);

			if (_bodyLines.Count == 0)
				return; // there is nothing to analyze

			// Analyze the whitespace structure of the body lines, find out if there is a fixed column width
			_globalStructure = new AsciiGlobalStructureAnalysis(_bodyLines);

			// Sets all separation strategies to test for. If importOptions already contain a separation strategy, only this separation strategy is set
			SetLineAnalysisOptionsToTest(importOptions, analysisOptions);

			// Analyze each of the first few lines with all possible separation strategies
			_lineAnalysisOfBodyLines = new AsciiLineAnalysis[_bodyLines.Count];

			// Do the analysis itself in parallel for each of the lines
			System.Threading.Tasks.Parallel.For(0, _bodyLines.Count, (i) => _lineAnalysisOfBodyLines[i] = new AsciiLineAnalysis(i, _bodyLines[i], _lineAnalysisOptionsToTest));

			// for debugging activate the next line and paste the data into notepad:
			// PutRecognizedStructuresToClipboard(result, separationStrategies);

			EvaluateScoringOfAllLineAnalysisOptions();

			// Evaluate the best separation strategy. Store the value in _highestScoredSeparationStrategy and the corresponding line structure in _highestScoredLineStructure;
			EvaluateHighestScoredLineAnalysisOption();

			// look how many header lines are in the file by comparing the structure of the first lines  with the _highestScoredLineStructure
			if (null == importOptions.NumberOfMainHeaderLines)
				EvaluateNumberOfMainHeaderLines();
			else
				_numberOfMainHeaderLines = importOptions.NumberOfMainHeaderLines.Value;

			// get the index of the caption line
			if (null == importOptions.IndexOfCaptionLine)
				EvaluateIndexOfCaptionLine();
			else
				_indexOfCaptionLine = importOptions.IndexOfCaptionLine.Value;

			importOptions.NumberOfMainHeaderLines = _numberOfMainHeaderLines;
			importOptions.IndexOfCaptionLine = _indexOfCaptionLine;

			importOptions.SeparationStrategy = _highestScoredLineAnalysisOption.SeparationStrategy;
			importOptions.NumberFormatCulture = _highestScoredLineAnalysisOption.NumberFormat;
			importOptions.DateTimeFormatCulture = _highestScoredLineAnalysisOption.DateTimeFormat;

			importOptions.RecognizedStructure = _lineAnalysisOptionsScoring[_highestScoredLineAnalysisOption].LineStructure;
		}