/// <summary>
/// Determines which line structure is the most frequent one among the analyzed lines (evaluated for the
/// given analysis option), and selects, among the lines having that structure, the line structure with
/// the highest <c>LineStructureScoring</c>.
/// </summary>
/// <param name="analysisOption">The analysis option that is used to look up the line structure in each element of <paramref name="result"/>.</param>
/// <param name="result">The line analyses to evaluate.</param>
/// <param name="excludeLineStructureHashes">Hash codes of line structures that must be ignored when searching for the most frequent structure.
/// Can be <c>null</c>. If a set is provided, this method may add further hash codes to it (see remarks).</param>
/// <param name="maxNumberOfEqualLines">On return, the number of lines that share the most frequent non-excluded line structure, or 0 if there is no such structure.</param>
/// <param name="bestLine">On return, the selected line structure, or <c>null</c> if none could be selected.</param>
/// <remarks>
/// If the selected line structure has a column count of zero, its hash code is added to the excluded hashes
/// and the evaluation is repeated, so that the next most frequent structure is used instead.
/// </remarks>
public static void CalculateScoreOfLineAnalysisOption(AsciiLineAnalysisOption analysisOption, IList<AsciiLineAnalysis> result, HashSet<int> excludeLineStructureHashes, out int maxNumberOfEqualLines, out AsciiLineStructure bestLine)
{
    bestLine = null;
    maxNumberOfEqualLines = 0;

    // Key: hash code of a line structure; Value: number of lines whose structure has this hash code
    var numberOfLinesForLineStructureHash = new Dictionary<int, int>();
    for (int i = 0; i < result.Count; i++)
    {
        int lineStructureHash = result[i][analysisOption].GetHashCode();

        int numberOfLinesSoFar;
        numberOfLinesForLineStructureHash.TryGetValue(lineStructureHash, out numberOfLinesSoFar); // yields 0 if the hash is not yet contained
        numberOfLinesForLineStructureHash[lineStructureHash] = numberOfLinesSoFar + 1;
    }

    // determine which of the (non-excluded) line structures is the most frequent one
    int hashOfMostFrequentStructure = 0;
    foreach (var dictEntry in numberOfLinesForLineStructureHash)
    {
        if (null != excludeLineStructureHashes && excludeLineStructureHashes.Contains(dictEntry.Key))
        {
            continue;
        }

        if (maxNumberOfEqualLines < dictEntry.Value)
        {
            maxNumberOfEqualLines = dictEntry.Value;
            hashOfMostFrequentStructure = dictEntry.Key;
        }
    }

    // No structure was selected (no lines, or all structures are excluded). Without this guard, the default
    // value 0 of hashOfMostFrequentStructure could accidentally match a line whose structure hash is 0.
    if (maxNumberOfEqualLines == 0)
    {
        return;
    }

    // search for the maximum priority among the lines with the most frequent structure
    // (because of '>=', the last line wins if multiple lines have the same priority)
    int maxPriorityOfMostFrequentLines = 0;
    for (int i = 0; i < result.Count; i++)
    {
        var lineStructure = result[i][analysisOption];
        if (hashOfMostFrequentStructure != lineStructure.GetHashCode())
        {
            continue;
        }

        int priority = lineStructure.LineStructureScoring;
        if (priority >= maxPriorityOfMostFrequentLines)
        {
            maxPriorityOfMostFrequentLines = priority;
            bestLine = lineStructure;
        }
    }

    // If the bestLine is a line with a column count of zero, we should use the next best line.
    // We achieve this by adding the best hash to the set of excluded hashes and calling the function again.
    if (bestLine != null && bestLine.Count == 0)
    {
        if (null == excludeLineStructureHashes)
        {
            excludeLineStructureHashes = new HashSet<int>();
        }

        // Add returns false if the hash was already excluded; in that case we must not recurse again
        if (excludeLineStructureHashes.Add(hashOfMostFrequentStructure))
        {
            CalculateScoreOfLineAnalysisOption(analysisOption, result, excludeLineStructureHashes, out maxNumberOfEqualLines, out bestLine);
        }
    }
}
/// <summary>
/// Analyzes the first lines of the ascii stream. The number of lines to read is taken from
/// <paramref name="analysisOptions"/> (<c>NumberOfLinesToAnalyze</c>).
/// </summary>
/// <param name="importOptions">The import options. This can already contain known values. On return, this instance should be ready to be used to import ascii data, i.e. all fields should contain values unequal to <c>null</c>.
/// Note: if no body lines could be read from the stream, the method returns early and this instance is left unchanged.</param>
/// <param name="stream">The ascii stream to analyze.</param>
/// <param name="analysisOptions">Options that specify how many lines are analyzed, and what number formats and date/time formats will be tested.</param>
/// <exception cref="ArgumentNullException">If <paramref name="stream"/>, <paramref name="analysisOptions"/> or <paramref name="importOptions"/> is <c>null</c>.</exception>
public void InternalAnalyze(AsciiImportOptions importOptions, System.IO.Stream stream, AsciiDocumentAnalysisOptions analysisOptions)
{
    if (null == stream)
    {
        throw new ArgumentNullException("stream"); // paramName must match the parameter name exactly
    }

    if (null == analysisOptions)
    {
        throw new ArgumentNullException("analysisOptions");
    }

    if (null == importOptions)
    {
        throw new ArgumentNullException("importOptions");
    }

    // Read-in the lines into _bodyLines. If the number of header lines is already known, those header lines are read into _headerLines
    ReadLinesToAnalyze(stream, analysisOptions.NumberOfLinesToAnalyze, importOptions.NumberOfMainHeaderLines);

    if (_bodyLines.Count == 0)
    {
        return; // there is nothing to analyze
    }

    // Analyze the whitespace structure of the body lines, find out if there is a fixed column width
    _globalStructure = new AsciiGlobalStructureAnalysis(_bodyLines);

    // Sets all separation strategies to test for. If importOptions already contain a separation strategy, only this separation strategy is set
    SetLineAnalysisOptionsToTest(importOptions, analysisOptions);

    // Analyze each of the first few lines with all possible separation strategies
    _lineAnalysisOfBodyLines = new AsciiLineAnalysis[_bodyLines.Count];

    // Do the analysis itself in parallel for each of the lines (each iteration writes only to its own array slot)
    System.Threading.Tasks.Parallel.For(0, _bodyLines.Count, (i) => _lineAnalysisOfBodyLines[i] = new AsciiLineAnalysis(i, _bodyLines[i], _lineAnalysisOptionsToTest));

    // for debugging activate the next line and paste the data into notepad:
    // PutRecognizedStructuresToClipboard(result, separationStrategies);

    EvaluateScoringOfAllLineAnalysisOptions();

    // Evaluate the best separation strategy. The result is read below from _highestScoredLineAnalysisOption
    // (and the corresponding line structure from _lineAnalysisOptionsScoring).
    EvaluateHighestScoredLineAnalysisOption();

    // look how many header lines are in the file by comparing the structure of the first lines with the highest scored line structure
    if (null == importOptions.NumberOfMainHeaderLines)
    {
        EvaluateNumberOfMainHeaderLines();
    }
    else
    {
        _numberOfMainHeaderLines = importOptions.NumberOfMainHeaderLines.Value;
    }

    // get the index of the caption line
    if (null == importOptions.IndexOfCaptionLine)
    {
        EvaluateIndexOfCaptionLine();
    }
    else
    {
        _indexOfCaptionLine = importOptions.IndexOfCaptionLine.Value;
    }

    // write the results of the analysis back into the import options
    importOptions.NumberOfMainHeaderLines = _numberOfMainHeaderLines;
    importOptions.IndexOfCaptionLine = _indexOfCaptionLine;

    importOptions.SeparationStrategy = _highestScoredLineAnalysisOption.SeparationStrategy;
    importOptions.NumberFormatCulture = _highestScoredLineAnalysisOption.NumberFormat;
    importOptions.DateTimeFormatCulture = _highestScoredLineAnalysisOption.DateTimeFormat;

    importOptions.RecognizedStructure = _lineAnalysisOptionsScoring[_highestScoredLineAnalysisOption].LineStructure;
}
/// <summary>
/// Analyzes the first lines of the ascii stream. The number of lines to read is taken from
/// <paramref name="analysisOptions"/> (<c>NumberOfLinesToAnalyze</c>).
/// </summary>
/// <param name="importOptions">The import options. This can already contain known values. On return, this instance should be ready to be used to import ascii data, i.e. all fields should contain values unequal to <c>null</c>.
/// Note: if no body lines could be read from the stream, the method returns early and this instance is left unchanged.</param>
/// <param name="stream">The ascii stream to analyze.</param>
/// <param name="analysisOptions">Options that specify how many lines are analyzed, and what number formats and date/time formats will be tested.</param>
/// <exception cref="ArgumentNullException">If <paramref name="stream"/>, <paramref name="analysisOptions"/> or <paramref name="importOptions"/> is <c>null</c>.</exception>
public void InternalAnalyze(AsciiImportOptions importOptions, System.IO.Stream stream, AsciiDocumentAnalysisOptions analysisOptions)
{
    if (null == stream)
    {
        throw new ArgumentNullException("stream"); // paramName must match the parameter name exactly
    }

    if (null == analysisOptions)
    {
        throw new ArgumentNullException("analysisOptions");
    }

    if (null == importOptions)
    {
        throw new ArgumentNullException("importOptions");
    }

    // Read-in the lines into _bodyLines. If the number of header lines is already known, those header lines are read into _headerLines
    ReadLinesToAnalyze(stream, analysisOptions.NumberOfLinesToAnalyze, importOptions.NumberOfMainHeaderLines);

    if (_bodyLines.Count == 0)
    {
        return; // there is nothing to analyze
    }

    // Analyze the whitespace structure of the body lines, find out if there is a fixed column width
    _globalStructure = new AsciiGlobalStructureAnalysis(_bodyLines);

    // Sets all separation strategies to test for. If importOptions already contain a separation strategy, only this separation strategy is set
    SetLineAnalysisOptionsToTest(importOptions, analysisOptions);

    // Analyze each of the first few lines with all possible separation strategies
    _lineAnalysisOfBodyLines = new AsciiLineAnalysis[_bodyLines.Count];

    // Do the analysis itself in parallel for each of the lines (each iteration writes only to its own array slot)
    System.Threading.Tasks.Parallel.For(0, _bodyLines.Count, (i) => _lineAnalysisOfBodyLines[i] = new AsciiLineAnalysis(i, _bodyLines[i], _lineAnalysisOptionsToTest));

    // for debugging activate the next line and paste the data into notepad:
    // PutRecognizedStructuresToClipboard(result, separationStrategies);

    EvaluateScoringOfAllLineAnalysisOptions();

    // Evaluate the best separation strategy. The result is read below from _highestScoredLineAnalysisOption
    // (and the corresponding line structure from _lineAnalysisOptionsScoring).
    EvaluateHighestScoredLineAnalysisOption();

    // look how many header lines are in the file by comparing the structure of the first lines with the highest scored line structure
    if (null == importOptions.NumberOfMainHeaderLines)
    {
        EvaluateNumberOfMainHeaderLines();
    }
    else
    {
        _numberOfMainHeaderLines = importOptions.NumberOfMainHeaderLines.Value;
    }

    // get the index of the caption line
    if (null == importOptions.IndexOfCaptionLine)
    {
        EvaluateIndexOfCaptionLine();
    }
    else
    {
        _indexOfCaptionLine = importOptions.IndexOfCaptionLine.Value;
    }

    // write the results of the analysis back into the import options
    importOptions.NumberOfMainHeaderLines = _numberOfMainHeaderLines;
    importOptions.IndexOfCaptionLine = _indexOfCaptionLine;

    importOptions.SeparationStrategy = _highestScoredLineAnalysisOption.SeparationStrategy;
    importOptions.NumberFormatCulture = _highestScoredLineAnalysisOption.NumberFormat;
    importOptions.DateTimeFormatCulture = _highestScoredLineAnalysisOption.DateTimeFormat;

    importOptions.RecognizedStructure = _lineAnalysisOptionsScoring[_highestScoredLineAnalysisOption].LineStructure;
}