예제 #1
0
        /// <summary>
        /// Determines which line structure (identified by its hash code) occurs most frequently among the analyzed
        /// lines for the given analysis option, and picks the highest scored line structure among the lines having it.
        /// </summary>
        /// <param name="analysisOption">The analysis option used to index into each element of <paramref name="result"/>.</param>
        /// <param name="result">The line analyses to evaluate.</param>
        /// <param name="excludeLineStructureHashes">
        /// Hash codes of line structures that are ignored when searching for the most frequent structure. May be <c>null</c>.
        /// If a set is provided, it is modified by this call: the hash of each most frequent structure that turns out to
        /// have a column count of zero is added to it before the search is repeated.
        /// </param>
        /// <param name="maxNumberOfEqualLines">
        /// On return, the number of lines sharing the most frequent, not excluded line structure hash; 0 if there is no such structure.
        /// </param>
        /// <param name="bestLine">
        /// On return, the line structure with the highest <c>LineStructureScoring</c> among the lines that have the most
        /// frequent structure hash (the last such line wins on ties), or <c>null</c> if no line qualifies.
        /// </param>
        public static void CalculateScoreOfLineAnalysisOption(AsciiLineAnalysisOption analysisOption, IList <AsciiLineAnalysis> result, HashSet <int> excludeLineStructureHashes, out int maxNumberOfEqualLines, out AsciiLineStructure bestLine)
        {
            bestLine              = null;
            maxNumberOfEqualLines = 0;

            // Hash of the line structure of every line, cached so that it is computed only once per line.
            var lineStructureHashes = new int[result.Count];

            // Key is the hash of the line structure, Value is the number of lines that have this hash.
            var numberOfLinesForLineStructureHash = new Dictionary <int, int>();

            for (int i = 0; i < result.Count; i++)
            {
                int lineStructureHash  = result[i][analysisOption].GetHashCode();
                lineStructureHashes[i] = lineStructureHash;

                // TryGetValue leaves the count at 0 for a hash that was not seen before (single lookup instead of ContainsKey + indexer).
                int numberOfLinesSoFar;
                numberOfLinesForLineStructureHash.TryGetValue(lineStructureHash, out numberOfLinesSoFar);
                numberOfLinesForLineStructureHash[lineStructureHash] = numberOfLinesSoFar + 1;
            }

            // Determine which of the (not excluded) line structures is the most frequent one.
            int hashOfMostFrequentStructure = 0;
            foreach (var dictEntry in numberOfLinesForLineStructureHash)
            {
                if (null != excludeLineStructureHashes && excludeLineStructureHashes.Contains(dictEntry.Key))
                {
                    continue;
                }

                if (maxNumberOfEqualLines < dictEntry.Value)
                {
                    maxNumberOfEqualLines       = dictEntry.Value;
                    hashOfMostFrequentStructure = dictEntry.Key;
                }
            }

            // Every dictionary entry has a count of at least 1, so a maximum of 0 means that there were no lines at all
            // or that all structures are excluded. Without this guard the initial value 0 of hashOfMostFrequentStructure
            // could accidentally match a line whose structure hash happens to be 0 (even an excluded one).
            if (maxNumberOfEqualLines == 0)
            {
                return;
            }

            // Search for the maximum priority of those lines with the most frequent structure.
            // The priority starts at 0, thus lines with a negative scoring are never selected.
            int maxPriorityOfMostFrequentLines = 0;
            for (int i = 0; i < result.Count; i++)
            {
                if (lineStructureHashes[i] != hashOfMostFrequentStructure)
                {
                    continue;
                }

                var lineStructure = result[i][analysisOption];
                int priority      = lineStructure.LineStructureScoring;
                if (priority >= maxPriorityOfMostFrequentLines)
                {
                    maxPriorityOfMostFrequentLines = priority;
                    bestLine                       = lineStructure;
                }
            }

            // If the best line is a line with a column count of zero, we should use the next best line.
            // We achieve this by adding the best hash to the set of excluded hashes and calling the function again.
            if (bestLine != null && bestLine.Count == 0)
            {
                if (null == excludeLineStructureHashes)
                {
                    excludeLineStructureHashes = new HashSet <int>();
                }

                // Add returns false if the hash is already excluded; in that case recursing could not make progress.
                if (excludeLineStructureHashes.Add(hashOfMostFrequentStructure))
                {
                    CalculateScoreOfLineAnalysisOption(analysisOption, result, excludeLineStructureHashes, out maxNumberOfEqualLines, out bestLine);
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Analyzes the first lines of the ascii stream (at most <c>analysisOptions.NumberOfLinesToAnalyze</c> lines are read)
        /// and writes the recognized settings into <paramref name="importOptions"/>.
        /// </summary>
        /// <param name="importOptions">The import options. This can already contain known values. On return, this instance should be ready to be used to import ascii data, i.e. all fields should contain values unequal to <c>null</c>. Note that it is left unmodified if no body lines could be read from the stream.</param>
        /// <param name="stream">The ascii stream to analyze.</param>
        /// <param name="analysisOptions">Options that specify how many lines are analyzed, and what number formats and date/time formats will be tested.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="stream"/>, <paramref name="analysisOptions"/> or <paramref name="importOptions"/> is <c>null</c>.
        /// </exception>
        public void InternalAnalyze(AsciiImportOptions importOptions, System.IO.Stream stream, AsciiDocumentAnalysisOptions analysisOptions)
        {
            if (null == stream)
            {
                // the parameter name must match the declared parameter exactly (was "Stream")
                throw new ArgumentNullException("stream");
            }
            if (null == analysisOptions)
            {
                throw new ArgumentNullException("analysisOptions");
            }
            if (null == importOptions)
            {
                throw new ArgumentNullException("importOptions");
            }

            // Read-in the lines into _bodyLines. If the number of header lines is already known, those header lines are read into _headerLines
            ReadLinesToAnalyze(stream, analysisOptions.NumberOfLinesToAnalyze, importOptions.NumberOfMainHeaderLines);

            if (_bodyLines.Count == 0)
            {
                return; // there is nothing to analyze; importOptions is left as it was passed in
            }

            // Analyze the whitespace structure of the body lines, find out if there is a fixed column width
            _globalStructure = new AsciiGlobalStructureAnalysis(_bodyLines);

            // Sets all separation strategies to test for. If importOptions already contain a separation strategy, only this separation strategy is set
            SetLineAnalysisOptionsToTest(importOptions, analysisOptions);

            // Analyze each of the first few lines with all possible separation strategies
            _lineAnalysisOfBodyLines = new AsciiLineAnalysis[_bodyLines.Count];

            // Do the analysis itself in parallel for each of the lines; every iteration writes only to its own array slot
            System.Threading.Tasks.Parallel.For(0, _bodyLines.Count, (i) => _lineAnalysisOfBodyLines[i] = new AsciiLineAnalysis(i, _bodyLines[i], _lineAnalysisOptionsToTest));

            // for debugging activate the next line and paste the data into notepad:
            // PutRecognizedStructuresToClipboard(result, separationStrategies);

            EvaluateScoringOfAllLineAnalysisOptions();

            // Evaluate the best line analysis option. The outcome is read below from _highestScoredLineAnalysisOption
            // and from the corresponding entry of _lineAnalysisOptionsScoring.
            EvaluateHighestScoredLineAnalysisOption();

            // look how many header lines are in the file, unless the caller already specified the number
            if (null == importOptions.NumberOfMainHeaderLines)
            {
                EvaluateNumberOfMainHeaderLines();
            }
            else
            {
                _numberOfMainHeaderLines = importOptions.NumberOfMainHeaderLines.Value;
            }

            // get the index of the caption line, unless the caller already specified it
            if (null == importOptions.IndexOfCaptionLine)
            {
                EvaluateIndexOfCaptionLine();
            }
            else
            {
                _indexOfCaptionLine = importOptions.IndexOfCaptionLine.Value;
            }

            // write the results of the analysis back to the import options
            importOptions.NumberOfMainHeaderLines = _numberOfMainHeaderLines;
            importOptions.IndexOfCaptionLine      = _indexOfCaptionLine;

            importOptions.SeparationStrategy    = _highestScoredLineAnalysisOption.SeparationStrategy;
            importOptions.NumberFormatCulture   = _highestScoredLineAnalysisOption.NumberFormat;
            importOptions.DateTimeFormatCulture = _highestScoredLineAnalysisOption.DateTimeFormat;

            importOptions.RecognizedStructure = _lineAnalysisOptionsScoring[_highestScoredLineAnalysisOption].LineStructure;
        }
예제 #3
0
		/// <summary>
		/// Analyzes the first lines of the ascii stream (at most <c>analysisOptions.NumberOfLinesToAnalyze</c> lines are read)
		/// and writes the recognized settings into <paramref name="importOptions"/>.
		/// </summary>
		/// <param name="importOptions">The import options. This can already contain known values. On return, this instance should be ready to be used to import ascii data, i.e. all fields should contain values unequal to <c>null</c>. Note that it is left unmodified if no body lines could be read from the stream.</param>
		/// <param name="stream">The ascii stream to analyze.</param>
		/// <param name="analysisOptions">Options that specify how many lines are analyzed, and what number formats and date/time formats will be tested.</param>
		/// <exception cref="ArgumentNullException">
		/// <paramref name="stream"/>, <paramref name="analysisOptions"/> or <paramref name="importOptions"/> is <c>null</c>.
		/// </exception>
		public void InternalAnalyze(AsciiImportOptions importOptions, System.IO.Stream stream, AsciiDocumentAnalysisOptions analysisOptions)
		{
			if (null == stream)
			{
				// the parameter name must match the declared parameter exactly (was "Stream")
				throw new ArgumentNullException("stream");
			}
			if (null == analysisOptions)
			{
				throw new ArgumentNullException("analysisOptions");
			}
			if (null == importOptions)
			{
				throw new ArgumentNullException("importOptions");
			}

			// Read-in the lines into _bodyLines. If the number of header lines is already known, those header lines are read into _headerLines
			ReadLinesToAnalyze(stream, analysisOptions.NumberOfLinesToAnalyze, importOptions.NumberOfMainHeaderLines);

			if (_bodyLines.Count == 0)
			{
				return; // there is nothing to analyze; importOptions is left as it was passed in
			}

			// Analyze the whitespace structure of the body lines, find out if there is a fixed column width
			_globalStructure = new AsciiGlobalStructureAnalysis(_bodyLines);

			// Sets all separation strategies to test for. If importOptions already contain a separation strategy, only this separation strategy is set
			SetLineAnalysisOptionsToTest(importOptions, analysisOptions);

			// Analyze each of the first few lines with all possible separation strategies
			_lineAnalysisOfBodyLines = new AsciiLineAnalysis[_bodyLines.Count];

			// Do the analysis itself in parallel for each of the lines; every iteration writes only to its own array slot
			System.Threading.Tasks.Parallel.For(0, _bodyLines.Count, (i) => _lineAnalysisOfBodyLines[i] = new AsciiLineAnalysis(i, _bodyLines[i], _lineAnalysisOptionsToTest));

			// for debugging activate the next line and paste the data into notepad:
			// PutRecognizedStructuresToClipboard(result, separationStrategies);

			EvaluateScoringOfAllLineAnalysisOptions();

			// Evaluate the best line analysis option. The outcome is read below from _highestScoredLineAnalysisOption
			// and from the corresponding entry of _lineAnalysisOptionsScoring.
			EvaluateHighestScoredLineAnalysisOption();

			// look how many header lines are in the file, unless the caller already specified the number
			if (null == importOptions.NumberOfMainHeaderLines)
			{
				EvaluateNumberOfMainHeaderLines();
			}
			else
			{
				_numberOfMainHeaderLines = importOptions.NumberOfMainHeaderLines.Value;
			}

			// get the index of the caption line, unless the caller already specified it
			if (null == importOptions.IndexOfCaptionLine)
			{
				EvaluateIndexOfCaptionLine();
			}
			else
			{
				_indexOfCaptionLine = importOptions.IndexOfCaptionLine.Value;
			}

			// write the results of the analysis back to the import options
			importOptions.NumberOfMainHeaderLines = _numberOfMainHeaderLines;
			importOptions.IndexOfCaptionLine = _indexOfCaptionLine;

			importOptions.SeparationStrategy = _highestScoredLineAnalysisOption.SeparationStrategy;
			importOptions.NumberFormatCulture = _highestScoredLineAnalysisOption.NumberFormat;
			importOptions.DateTimeFormatCulture = _highestScoredLineAnalysisOption.DateTimeFormat;

			importOptions.RecognizedStructure = _lineAnalysisOptionsScoring[_highestScoredLineAnalysisOption].LineStructure;
		}