/// <summary>
/// Checks whether this line structure is compatible with another line structure.
/// </summary>
/// <param name="ano">The other line structure to compare with.</param>
/// <returns>
/// <c>true</c> if this structure has at least as many columns as <paramref name="ano"/> and, for every
/// column index of <paramref name="ano"/>, the column type of this structure is compatible (as decided by
/// the two-argument <c>IsCompatibleWith</c> overload) with the column type of <paramref name="ano"/>;
/// otherwise, <c>false</c>.
/// </returns>
public bool IsCompatibleWith(AsciiLineStructure ano)
{
    // This structure may have surplus columns, but it must not have fewer columns than the other one.
    if (ano.Count > Count)
    {
        return false;
    }

    // Only the columns that exist in the other structure are compared.
    int column = 0;
    while (column < ano.Count)
    {
        bool columnFits = IsCompatibleWith(_recognizedTypes[column].ColumnType, ano._recognizedTypes[column].ColumnType);
        if (!columnFits)
        {
            return false;
        }
        column++;
    }

    return true;
}
/// <summary>
/// Analyzes the tokens of one line of text (already split according to one separation strategy) and
/// returns the resulting line structure.
/// </summary>
/// <param name="nLine">Line number. It is not referenced in the body of this method.</param>
/// <param name="tokens">The content of the line, already separated into tokens.</param>
/// <param name="numberFormat">The culture used to recognize numbers.</param>
/// <param name="dateTimeFormat">The culture used to recognize date/time values.</param>
/// <returns>A line structure that contains one column entry per token, in token order.</returns>
public static AsciiLineStructure GetStructure(int nLine, IEnumerable<string> tokens, System.Globalization.CultureInfo numberFormat, System.Globalization.CultureInfo dateTimeFormat)
{
    var structure = new AsciiLineStructure();

    foreach (string token in tokens)
    {
        // An empty token means that nothing stands between two separators.
        if (string.IsNullOrEmpty(token))
        {
            structure.Add(AsciiColumnInfo.DBNull);
            continue;
        }

        // Not a number: it is either a date/time value or plain text.
        if (!IsNumeric(token, numberFormat))
        {
            if (IsDateTime(token, dateTimeFormat))
            {
                structure.Add(AsciiColumnInfo.DateTime);
            }
            else
            {
                structure.Add(AsciiColumnInfo.Text);
            }
            continue;
        }

        // From here on the token is numeric; find the most specific numeric kind.
        if (IsIntegral(token, numberFormat))
        {
            structure.Add(AsciiColumnInfo.Integer);
            continue;
        }

        if (!IsFloat(token, numberFormat))
        {
            structure.Add(AsciiColumnInfo.GeneralNumber);
            continue;
        }

        // Floating point numbers are distinguished by whether the decimal separator is actually present.
        bool hasDecimalSeparator = token.Contains(numberFormat.NumberFormat.NumberDecimalSeparator);
        if (hasDecimalSeparator)
        {
            structure.Add(AsciiColumnInfo.FloatWithDecimalSeparator);
        }
        else
        {
            structure.Add(AsciiColumnInfo.FloatWithoutDecimalSeparator);
        }
    }

    return structure;
}
/// <summary>
/// Finds the most frequent line structure (identified by its hash code) for one separation strategy and
/// reports how many lines share it.
/// </summary>
/// <param name="result">List of <c>AsciiLineAnalyzer</c> instances, one per analyzed line.</param>
/// <param name="sep">The separation strategy whose line structures are evaluated.</param>
/// <param name="bestLine">
/// Always overwritten. On return: among the lines having the most frequent structure hash, the structure with
/// the highest <c>Priority</c>; remains <c>null</c> if none of those lines has a priority greater than zero.
/// </param>
/// <returns>The number of lines that share the most frequent structure hash.</returns>
public static int GetPriorityOf(System.Collections.ArrayList result, AsciiLineAnalyzer.Separation sep, ref AsciiLineStructure bestLine)
{
    // Key: hash code of a line structure, Value: number of lines with that hash code.
    System.Collections.Hashtable linesPerHash = new System.Collections.Hashtable();
    bestLine = null;

    foreach (AsciiLineAnalyzer analyzer in result)
    {
        int structureHash = analyzer.structure[(int)sep].GetHashCode();
        object countSoFar = linesPerHash[structureHash];
        if (countSoFar == null)
        {
            linesPerHash.Add(structureHash, 1);
        }
        else
        {
            linesPerHash[structureHash] = 1 + (int)countSoFar;
        }
    }

    // Determine the hash code that occurs most often.
    int highestCount = 0;
    int mostFrequentHash = 0;
    foreach (System.Collections.DictionaryEntry entry in linesPerHash)
    {
        int occurrences = (int)entry.Value;
        if (occurrences > highestCount)
        {
            highestCount = occurrences;
            mostFrequentHash = (int)entry.Key;
        }
    }

    // Among the lines with that hash code, pick the one with the highest priority.
    int highestPriority = 0;
    foreach (AsciiLineAnalyzer analyzer in result)
    {
        if (analyzer.structure[(int)sep].GetHashCode() != mostFrequentHash)
        {
            continue;
        }

        int priority = analyzer.structure[(int)sep].Priority;
        if (priority > highestPriority)
        {
            highestPriority = priority;
            bestLine = analyzer.structure[(int)sep];
        }
    }

    return highestCount;
}
/// <summary>
/// Checks whether this line structure is compatible with another line structure.
/// </summary>
/// <param name="ano">The other line structure to compare with.</param>
/// <returns>
/// <c>true</c> if this structure has at least as many columns as <paramref name="ano"/> and every column
/// of <paramref name="ano"/> either has the same type as the corresponding column here, or one of the two
/// columns is of type <see cref="DBNull"/>; otherwise, <c>false</c>.
/// </returns>
public bool IsCompatibleWith(AsciiLineStructure ano)
{
    // This structure may have surplus columns, but it must not have fewer columns than the other one.
    if (ano.Count > this.Count)
    {
        return false;
    }

    for (int column = 0; column < ano.Count; column++)
    {
        // An empty (DBNull) column on either side matches anything.
        bool eitherIsEmpty = this[column] == typeof(DBNull) || ano[column] == typeof(DBNull);
        if (!eitherIsEmpty && this[column] != ano[column])
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Splits a line of text at every occurrence of the given separator and records, for each resulting
/// field, the type that the field was recognized as.
/// </summary>
/// <param name="nLine">Line number; stored in the <c>LineNumber</c> property of the returned structure.</param>
/// <param name="sLine">The text of the line to analyze.</param>
/// <param name="separator">
/// The separator string. A separator that is longer than one character is skipped as a whole, so that
/// none of its characters ends up in the following field.
/// </param>
/// <returns>
/// A line structure with one entry per field: <see cref="DBNull"/> for an empty field, <see cref="double"/>
/// for a numeric field, <see cref="System.DateTime"/> for a date/time field, and <see cref="string"/> otherwise.
/// </returns>
public static AsciiLineStructure AssumeSeparator(int nLine, string sLine, string separator)
{
    AsciiLineStructure tabStruc = new AsciiLineStructure();
    tabStruc.LineNumber = nLine;

    int len = sLine.Length;
    int ix = 0;

    // After each field, continue behind the complete separator (not just behind its first character).
    // The step is at least 1, so that an empty separator still advances through the line.
    for (int start = 0; start <= len; start = ix + (separator.Length > 0 ? separator.Length : 1))
    {
        ix = sLine.IndexOf(separator, start, len - start);
        if (ix == -1)
        {
            ix = len; // no further separator: the field extends to the end of the line
        }

        // Try to interpret the field first as numeric, then as DateTime, and finally as string.
        string substring = sLine.Substring(start, ix - start);
        if (ix == start) // the separator follows immediately, so the field is empty
        {
            tabStruc.Add(typeof(DBNull));
        }
        else if (IsNumeric(substring))
        {
            tabStruc.Add(typeof(double));
            tabStruc.AddToDecimalSeparatorStatistics(substring); // keep statistics about the use of the decimal separator
        }
        else if (IsDateTime(substring))
        {
            tabStruc.Add(typeof(System.DateTime));
        }
        else
        {
            tabStruc.Add(typeof(string));
        }
    }

    return tabStruc;
}
/// <summary>
/// Analyzes the tokens of one line of text (already split according to one separation strategy) and
/// returns the resulting line structure.
/// </summary>
/// <param name="nLine">Line number. It is not referenced in the body of this method.</param>
/// <param name="tokens">The content of the line, already separated into tokens.</param>
/// <param name="numberFormat">The culture used to recognize numbers.</param>
/// <param name="dateTimeFormat">The culture used to recognize date/time values.</param>
/// <returns>A line structure that contains one column entry per token, in token order.</returns>
public static AsciiLineStructure GetStructure(int nLine, IEnumerable<string> tokens, System.Globalization.CultureInfo numberFormat, System.Globalization.CultureInfo dateTimeFormat)
{
    AsciiLineStructure lineStructure = new AsciiLineStructure();

    foreach (string cell in tokens)
    {
        if (string.IsNullOrEmpty(cell))
        {
            // nothing stands between two separators
            lineStructure.Add(AsciiColumnInfo.DBNull);
        }
        else if (!IsNumeric(cell, numberFormat))
        {
            // not a number: either a date/time value or plain text
            if (IsDateTime(cell, dateTimeFormat))
            {
                lineStructure.Add(AsciiColumnInfo.DateTime);
            }
            else
            {
                lineStructure.Add(AsciiColumnInfo.Text);
            }
        }
        else if (IsIntegral(cell, numberFormat))
        {
            lineStructure.Add(AsciiColumnInfo.Integer);
        }
        else if (!IsFloat(cell, numberFormat))
        {
            // numeric, but neither integral nor floating point
            lineStructure.Add(AsciiColumnInfo.GeneralNumber);
        }
        else if (cell.Contains(numberFormat.NumberFormat.NumberDecimalSeparator))
        {
            lineStructure.Add(AsciiColumnInfo.FloatWithDecimalSeparator);
        }
        else
        {
            lineStructure.Add(AsciiColumnInfo.FloatWithoutDecimalSeparator);
        }
    }

    return lineStructure;
}
/// <summary>
/// Determines the separation strategy with the highest score and stores it in
/// <see cref="_highestScoredLineAnalysisOption"/>; the corresponding line structure is stored in
/// <see cref="_highestScoredLineStructure"/>.
/// </summary>
/// <remarks>
/// The score of an entry is the product of its number of lines and the scoring of its line structure.
/// If two entries have the same score, the one with the larger number of lines wins.
/// </remarks>
private void EvaluateHighestScoredLineAnalysisOption()
{
    // Start with the first entry as the provisional winner (First() is expected to fail on an empty collection).
    var winner = _lineAnalysisOptionsScoring.First();
    double winnerScore = int.MinValue;

    foreach (var candidate in _lineAnalysisOptionsScoring)
    {
        double candidateScore = (double)candidate.Value.NumberOfLines * candidate.Value.LineStructure.LineStructureScoring;

        bool scoresHigher = candidateScore > winnerScore;
        bool winsTieBreak = candidateScore == winnerScore && candidate.Value.NumberOfLines > winner.Value.NumberOfLines;

        if (scoresHigher || winsTieBreak)
        {
            winnerScore = candidateScore; // unchanged in the tie-break case, because both scores are equal there
            winner = candidate;
        }
    }

    _highestScoredLineAnalysisOption = winner.Key;
    _highestScoredLineStructure = winner.Value.LineStructure;
}
/// <summary>
/// Determines, for one analysis option, which line structure is the most frequent one among the analyzed
/// lines, and returns the number of lines having that structure together with the best scored of those lines.
/// </summary>
/// <param name="analysisOption">The analysis option whose line structures are evaluated.</param>
/// <param name="result">The analysis results, one per analyzed line.</param>
/// <param name="excludeLineStructureHashes">
/// Hash codes of line structures that must not be chosen as the most frequent structure. May be <c>null</c>.
/// A non-null set is modified by this method when a structure with zero columns has to be skipped.
/// </param>
/// <param name="maxNumberOfEqualLines">On return, the number of lines that share the most frequent (non-excluded) structure hash.</param>
/// <param name="bestLine">
/// On return, the line structure with the highest <c>LineStructureScoring</c> among the lines with the most
/// frequent structure hash (the last one in case of equal scoring), or <c>null</c> if no such line was found.
/// </param>
public static void CalculateScoreOfLineAnalysisOption(AsciiLineAnalysisOption analysisOption, IList<AsciiLineAnalysis> result, HashSet<int> excludeLineStructureHashes, out int maxNumberOfEqualLines, out AsciiLineStructure bestLine)
{
    // Key: hash code of a line structure, Value: number of lines that have this hash code.
    var lineCountByHash = new Dictionary<int, int>();
    bestLine = null;

    foreach (AsciiLineAnalysis analysis in result)
    {
        int structureHash = analysis[analysisOption].GetHashCode();
        int countSoFar;
        lineCountByHash.TryGetValue(structureHash, out countSoFar); // countSoFar is 0 if the hash is new
        lineCountByHash[structureHash] = countSoFar + 1;
    }

    // Find the most frequent structure hash, ignoring the excluded ones.
    maxNumberOfEqualLines = 0;
    int mostFrequentHash = 0;
    foreach (KeyValuePair<int, int> pair in lineCountByHash)
    {
        bool isExcluded = excludeLineStructureHashes != null && excludeLineStructureHashes.Contains(pair.Key);
        if (!isExcluded && pair.Value > maxNumberOfEqualLines)
        {
            maxNumberOfEqualLines = pair.Value;
            mostFrequentHash = pair.Key;
        }
    }

    // Among the lines with the most frequent structure, search for the one with the maximum scoring.
    int highestScoring = 0;
    foreach (AsciiLineAnalysis analysis in result)
    {
        var structure = analysis[analysisOption];
        if (structure.GetHashCode() != mostFrequentHash)
        {
            continue;
        }

        int scoring = structure.LineStructureScoring;
        if (scoring >= highestScoring)
        {
            highestScoring = scoring;
            bestLine = structure;
        }
    }

    // A best line with zero columns is not useful. In that case the hash of that structure is put on the
    // exclusion list and the evaluation is repeated in order to get the next best structure.
    bool bestLineHasNoColumns = bestLine != null && bestLine.Count == 0;
    if (!bestLineHasNoColumns)
    {
        return;
    }

    if (excludeLineStructureHashes == null)
    {
        excludeLineStructureHashes = new HashSet<int>() { mostFrequentHash };
    }
    else if (!excludeLineStructureHashes.Contains(mostFrequentHash))
    {
        excludeLineStructureHashes.Add(mostFrequentHash);
    }
    else
    {
        // The hash is already excluded; repeating the evaluation would not change anything.
        return;
    }

    CalculateScoreOfLineAnalysisOption(analysisOption, result, excludeLineStructureHashes, out maxNumberOfEqualLines, out bestLine);
}
/// <summary>
/// Determines, for one analysis option, which line structure is the most frequent one among the analyzed
/// lines. Forwards to the overload that accepts a set of excluded structure hashes, passing <c>null</c> for that set.
/// </summary>
/// <param name="analysisOption">The analysis option whose line structures are evaluated.</param>
/// <param name="result">The analysis results, one per analyzed line.</param>
/// <param name="maxNumberOfEqualLines">On return, the value delivered by the forwarded call for the number of lines with the most frequent structure.</param>
/// <param name="bestLine">On return, the best line structure delivered by the forwarded call.</param>
public static void CalculateScoreOfLineAnalysisOption(AsciiLineAnalysisOption analysisOption, IList<AsciiLineAnalysis> result, out int maxNumberOfEqualLines, out AsciiLineStructure bestLine)
{
    // No structure hashes are excluded up front.
    HashSet<int> noExcludedHashes = null;
    CalculateScoreOfLineAnalysisOption(analysisOption, result, noExcludedHashes, out maxNumberOfEqualLines, out bestLine);
}
/// <summary>
/// Determines, for one analysis option, which line structure is the most frequent one among the analyzed
/// lines, and returns the number of lines having that structure together with the best scored of those lines.
/// </summary>
/// <param name="analysisOption">The analysis option whose line structures are evaluated.</param>
/// <param name="result">The analysis results, one per analyzed line.</param>
/// <param name="excludeLineStructureHashes">
/// Hash codes of line structures that must not be chosen as the most frequent structure. May be <c>null</c>.
/// A non-null set is modified by this method when a structure with zero columns has to be skipped.
/// </param>
/// <param name="maxNumberOfEqualLines">On return, the number of lines that share the most frequent (non-excluded) structure hash.</param>
/// <param name="bestLine">
/// On return, the line structure with the highest <c>LineStructureScoring</c> among the lines with the most
/// frequent structure hash (the last one in case of equal scoring), or <c>null</c> if no such line was found.
/// </param>
public static void CalculateScoreOfLineAnalysisOption(AsciiLineAnalysisOption analysisOption, IList<AsciiLineAnalysis> result, HashSet<int> excludeLineStructureHashes, out int maxNumberOfEqualLines, out AsciiLineStructure bestLine)
{
    // Key: hash code of a line structure, Value: number of lines that have this hash code.
    Dictionary<int, int> frequencyOfHash = new Dictionary<int, int>();
    bestLine = null;

    for (int lineIdx = 0; lineIdx < result.Count; lineIdx++)
    {
        int hashOfLine = result[lineIdx][analysisOption].GetHashCode();

        int previousCount;
        if (frequencyOfHash.TryGetValue(hashOfLine, out previousCount))
        {
            frequencyOfHash[hashOfLine] = previousCount + 1;
        }
        else
        {
            frequencyOfHash.Add(hashOfLine, 1);
        }
    }

    // Determine which of the (non-excluded) line structures is the most frequent one.
    maxNumberOfEqualLines = 0;
    int winningHash = 0;
    foreach (var hashAndCount in frequencyOfHash)
    {
        if (excludeLineStructureHashes != null && excludeLineStructureHashes.Contains(hashAndCount.Key))
        {
            continue;
        }

        if (hashAndCount.Value > maxNumberOfEqualLines)
        {
            maxNumberOfEqualLines = hashAndCount.Value;
            winningHash = hashAndCount.Key;
        }
    }

    // Among the lines with the most frequent structure, search for the one with the maximum scoring.
    int bestScoring = 0;
    for (int lineIdx = 0; lineIdx < result.Count; lineIdx++)
    {
        var candidate = result[lineIdx][analysisOption];
        if (candidate.GetHashCode() == winningHash)
        {
            int candidateScoring = candidate.LineStructureScoring;
            if (candidateScoring >= bestScoring)
            {
                bestScoring = candidateScoring;
                bestLine = candidate;
            }
        }
    }

    // A best line with zero columns is not useful. In that case the hash of that structure is put on the
    // exclusion list and the evaluation is repeated in order to get the next best structure.
    if (bestLine != null && bestLine.Count == 0)
    {
        bool newlyExcluded;
        if (excludeLineStructureHashes == null)
        {
            excludeLineStructureHashes = new HashSet<int>() { winningHash };
            newlyExcluded = true;
        }
        else
        {
            // Add returns false if the hash was already excluded; then a repetition would not change anything.
            newlyExcluded = excludeLineStructureHashes.Add(winningHash);
        }

        if (newlyExcluded)
        {
            CalculateScoreOfLineAnalysisOption(analysisOption, result, excludeLineStructureHashes, out maxNumberOfEqualLines, out bestLine);
        }
    }
}
/// <summary>
/// Determines, for one analysis option, which line structure is the most frequent one among the analyzed
/// lines. Forwards to the overload that accepts a set of excluded structure hashes, passing <c>null</c> for that set.
/// </summary>
/// <param name="analysisOption">The analysis option whose line structures are evaluated.</param>
/// <param name="result">The analysis results, one per analyzed line.</param>
/// <param name="maxNumberOfEqualLines">On return, the value delivered by the forwarded call for the number of lines with the most frequent structure.</param>
/// <param name="bestLine">On return, the best line structure delivered by the forwarded call.</param>
public static void CalculateScoreOfLineAnalysisOption(AsciiLineAnalysisOption analysisOption, IList<AsciiLineAnalysis> result, out int maxNumberOfEqualLines, out AsciiLineStructure bestLine)
{
    // Start without any excluded structure hashes.
    HashSet<int> initiallyExcluded = null;
    CalculateScoreOfLineAnalysisOption(analysisOption, result, initiallyExcluded, out maxNumberOfEqualLines, out bestLine);
}
/// <summary>
/// Creates the wrapper and stores the given line structure in the <c>_parent</c> field.
/// </summary>
/// <param name="parent">The line structure that this wrapper keeps a reference to.</param>
public CollectionWrapper(AsciiLineStructure parent) { _parent = parent; }
/// <summary>
/// Checks whether this line structure is compatible with another line structure.
/// </summary>
/// <param name="ano">The other line structure to compare with.</param>
/// <returns>
/// <c>true</c> if this structure has at least as many columns as <paramref name="ano"/> and every column
/// of <paramref name="ano"/> either has the same type as the corresponding column here, or one of the two
/// columns is of type <see cref="DBNull"/>; otherwise, <c>false</c>.
/// </returns>
public bool IsCompatibleWith(AsciiLineStructure ano)
{
    // This structure may have surplus columns, but it must not have fewer columns than the other one.
    bool hasEnoughColumns = !(this.Count < ano.Count);
    if (!hasEnoughColumns)
    {
        return false;
    }

    int column = 0;
    while (column < ano.Count)
    {
        // Columns are only in conflict if both are non-empty (not DBNull) and their types differ.
        if (this[column] != typeof(DBNull) && ano[column] != typeof(DBNull) && this[column] != ano[column])
        {
            return false;
        }
        column++;
    }

    return true;
}
/// <summary>
/// Splits a line of text at every occurrence of the given separator and records, for each resulting
/// field, the type that the field was recognized as.
/// </summary>
/// <param name="nLine">Line number; stored in the <c>LineNumber</c> property of the returned structure.</param>
/// <param name="sLine">The text of the line to analyze.</param>
/// <param name="separator">
/// The separator string. A separator that is longer than one character is skipped as a whole, so that
/// none of its characters ends up in the following field.
/// </param>
/// <returns>
/// A line structure with one entry per field: <see cref="DBNull"/> for an empty field, <see cref="double"/>
/// for a numeric field, <see cref="System.DateTime"/> for a date/time field, and <see cref="string"/> otherwise.
/// </returns>
public static AsciiLineStructure AssumeSeparator(int nLine, string sLine, string separator)
{
    AsciiLineStructure tabStruc = new AsciiLineStructure();
    tabStruc.LineNumber = nLine;

    int len = sLine.Length;
    int ix = 0;

    // After each field, continue behind the complete separator (not just behind its first character).
    // The step is at least 1, so that an empty separator still advances through the line.
    for (int start = 0; start <= len; start = ix + (separator.Length > 0 ? separator.Length : 1))
    {
        ix = sLine.IndexOf(separator, start, len - start);
        if (ix == -1)
        {
            ix = len; // no further separator: the field extends to the end of the line
        }

        // Try to interpret the field first as numeric, then as DateTime, and finally as string.
        string substring = sLine.Substring(start, ix - start);
        if (ix == start) // the separator follows immediately, so the field is empty
        {
            tabStruc.Add(typeof(DBNull));
        }
        else if (IsNumeric(substring))
        {
            tabStruc.Add(typeof(double));
            tabStruc.AddToDecimalSeparatorStatistics(substring); // keep statistics about the use of the decimal separator
        }
        else if (IsDateTime(substring))
        {
            tabStruc.Add(typeof(System.DateTime));
        }
        else
        {
            tabStruc.Add(typeof(string));
        }
    }

    return tabStruc;
}
/// <summary>
/// Finds the most frequent line structure (identified by its hash code) for one separation strategy and
/// reports how many lines share it.
/// </summary>
/// <param name="result">List of <c>AsciiLineAnalyzer</c> instances, one per analyzed line.</param>
/// <param name="sep">The separation strategy whose line structures are evaluated.</param>
/// <param name="bestLine">
/// Always overwritten. On return: among the lines having the most frequent structure hash, the structure with
/// the highest <c>Priority</c>; remains <c>null</c> if none of those lines has a priority greater than zero.
/// </param>
/// <returns>The number of lines that share the most frequent structure hash.</returns>
public static int GetPriorityOf(System.Collections.ArrayList result, AsciiLineAnalyzer.Separation sep, ref AsciiLineStructure bestLine)
{
    // Key: hash code of a line structure, Value: number of lines with that hash code.
    System.Collections.Hashtable occurrences = new System.Collections.Hashtable();
    bestLine = null;

    for (int lineIdx = 0; lineIdx < result.Count; lineIdx++)
    {
        AsciiLineAnalyzer analyzer = (AsciiLineAnalyzer)result[lineIdx];
        int structureHash = analyzer.structure[(int)sep].GetHashCode();

        if (occurrences.ContainsKey(structureHash))
        {
            occurrences[structureHash] = (int)occurrences[structureHash] + 1;
        }
        else
        {
            occurrences.Add(structureHash, 1);
        }
    }

    // Determine the hash code with the topmost frequency.
    int topFrequency = 0;
    int hashWithTopFrequency = 0;
    foreach (System.Collections.DictionaryEntry hashAndCount in occurrences)
    {
        int frequency = (int)hashAndCount.Value;
        if (frequency > topFrequency)
        {
            topFrequency = frequency;
            hashWithTopFrequency = (int)hashAndCount.Key;
        }
    }

    // Among the lines with that hash code, search for the one with the maximum priority.
    int topPriority = 0;
    for (int lineIdx = 0; lineIdx < result.Count; lineIdx++)
    {
        AsciiLineAnalyzer analyzer = (AsciiLineAnalyzer)result[lineIdx];
        if (analyzer.structure[(int)sep].GetHashCode() == hashWithTopFrequency)
        {
            int priority = analyzer.structure[(int)sep].Priority;
            if (priority > topPriority)
            {
                topPriority = priority;
                bestLine = analyzer.structure[(int)sep];
            }
        }
    }

    return topFrequency;
}