Beispiel #1
0
        /// <summary>
        /// Tests whether this line structure is compatible with another line structure.
        /// </summary>
        /// <param name="ano">The line structure to test against.</param>
        /// <returns><c>True</c> if this line structure has at least as many columns as <paramref name="ano"/> and, for every column index of <paramref name="ano"/>,
        /// the column type of this structure is compatible with the column type of <paramref name="ano"/> at the same index; otherwise, <c>false</c>.
        /// Columns of this structure that lie beyond the column count of <paramref name="ano"/> are not examined.
        /// </returns>
        public bool IsCompatibleWith(AsciiLineStructure ano)
        {
            // This structure may have more columns than the other one, but never fewer.
            bool compatible = Count >= ano.Count;

            // Stop at the first column pair that is not compatible.
            for (int column = 0; compatible && column < ano.Count; ++column)
            {
                compatible = IsCompatibleWith(_recognizedTypes[column].ColumnType, ano._recognizedTypes[column].ColumnType);
            }

            return compatible;
        }
Beispiel #2
0
        /// <summary>
        /// Classifies every token of an already separated line of text and returns the resulting line structure.
        /// </summary>
        /// <param name="nLine">Line number. The value is not read by this method.</param>
        /// <param name="tokens">The content of the line, already separated into tokens.</param>
        /// <param name="numberFormat">The culture used to recognize numbers.</param>
        /// <param name="dateTimeFormat">The culture used to recognize date/time values.</param>
        /// <returns>A new line structure that received exactly one column entry per token, in token order.</returns>
        public static AsciiLineStructure GetStructure(int nLine, IEnumerable <string> tokens, System.Globalization.CultureInfo numberFormat, System.Globalization.CultureInfo dateTimeFormat)
        {
            var lineStructure = new AsciiLineStructure();

            foreach (string token in tokens)
            {
                // A null or empty token means that nothing stood between two separators.
                if (string.IsNullOrEmpty(token))
                {
                    lineStructure.Add(AsciiColumnInfo.DBNull);
                    continue;
                }

                // Not a number: the token is either a date/time value or plain text.
                if (!IsNumeric(token, numberFormat))
                {
                    if (IsDateTime(token, dateTimeFormat))
                    {
                        lineStructure.Add(AsciiColumnInfo.DateTime);
                    }
                    else
                    {
                        lineStructure.Add(AsciiColumnInfo.Text);
                    }
                    continue;
                }

                // Numeric token: integral numbers are tested first, then floating point numbers.
                if (IsIntegral(token, numberFormat))
                {
                    lineStructure.Add(AsciiColumnInfo.Integer);
                    continue;
                }

                if (!IsFloat(token, numberFormat))
                {
                    lineStructure.Add(AsciiColumnInfo.GeneralNumber);
                    continue;
                }

                // Floating point number: record whether the decimal separator of the number culture occurs in the token.
                string decimalSeparator = numberFormat.NumberFormat.NumberDecimalSeparator;
                if (token.Contains(decimalSeparator))
                {
                    lineStructure.Add(AsciiColumnInfo.FloatWithDecimalSeparator);
                }
                else
                {
                    lineStructure.Add(AsciiColumnInfo.FloatWithoutDecimalSeparator);
                }
            }

            return lineStructure;
        }
 /// <summary>
 /// Finds, for one separation strategy, the most frequent line structure among the analysed lines
 /// and the line with the highest priority that has this structure.
 /// </summary>
 /// <param name="result">List of <see cref="AsciiLineAnalyzer"/> instances, one per analysed line.</param>
 /// <param name="sep">The separation strategy; its integer value is used as index into the <c>structure</c> member of each analyzer.</param>
 /// <param name="bestLine">Is set to <c>null</c> at the start. On return, the line structure with the highest <c>Priority</c>
 /// among the lines whose structure hash code is the most frequent one (the first such line if several share the highest priority),
 /// or <c>null</c> if <paramref name="result"/> contains no line with that hash code.</param>
 /// <returns>The number of lines that share the most frequent structure hash code (0 if <paramref name="result"/> is empty).</returns>
 /// <remarks>
 /// Line structures are grouped by their hash code only, so different structures with the same hash code are counted together.
 /// </remarks>
 public static int GetPriorityOf(System.Collections.ArrayList result, AsciiLineAnalyzer.Separation sep, ref AsciiLineStructure bestLine)
 {
   // Key: hash code of a line structure, Value: number of lines with that hash code.
   System.Collections.Hashtable sl = new System.Collections.Hashtable();
   bestLine=null;
   for(int i=0;i<result.Count;i++)
   {
     int p = ((AsciiLineAnalyzer)result[i]).structure[(int)sep].GetHashCode();
     object countSoFar = sl[p]; // null if this hash code was not seen before
     sl[p] = (null==countSoFar) ? 1 : 1+(int)countSoFar;
   }

   // get the count with the topmost frequency
   int nNumberOfMaxSame = 0;
   int nHashOfMaxSame = 0;
   foreach(System.Collections.DictionaryEntry ohash in sl)
   {
     int cnt = (int)ohash.Value;
     if(nNumberOfMaxSame<cnt)
     {
       nNumberOfMaxSame = cnt;
       nHashOfMaxSame = (int)ohash.Key;
     }
   } // for each

   // search for the max priority among the lines with the most frequent hash
   int nMaxPriorityOfMaxSame=0;
   for(int i=0;i<result.Count;i++)
   {
     AsciiLineStructure lineStructure = ((AsciiLineAnalyzer)result[i]).structure[(int)sep];
     if(nHashOfMaxSame != lineStructure.GetHashCode())
       continue;

     int prty = lineStructure.Priority;
     // The first matching line is always accepted. Otherwise bestLine would stay null
     // whenever no matching line has a priority greater than zero.
     if(null==bestLine || prty>nMaxPriorityOfMaxSame)
     {
       nMaxPriorityOfMaxSame = prty;
       bestLine = lineStructure;
     }
   } // for
   return nNumberOfMaxSame;
 }
Beispiel #4
0
        /// <summary>
        /// Checks whether this line structure is compatible with another line structure.
        /// </summary>
        /// <param name="ano">The other line structure to compare with.</param>
        /// <returns><c>false</c> if this structure has fewer columns than <paramref name="ano"/>, or if at some column index of <paramref name="ano"/>
        /// both entries differ from <c>typeof(DBNull)</c> and are not equal to each other; otherwise, <c>true</c>.</returns>
        public bool IsCompatibleWith(AsciiLineStructure ano)
        {
            // This structure may have more columns than the other one, but never fewer.
            if (this.Count < ano.Count)
            {
                return false;
            }

            int column = 0;
            while (column < ano.Count)
            {
                // An empty (DBNull) entry on either side is compatible with everything.
                bool eitherIsEmpty = this[column] == typeof(DBNull) || ano[column] == typeof(DBNull);
                if (!eitherIsEmpty && this[column] != ano[column])
                {
                    return false;
                }
                column++;
            }

            return true;
        }
        /// <summary>
        /// Splits a line of text at every occurrence of a separator string and records the recognized type of each resulting token.
        /// </summary>
        /// <param name="nLine">Line number; stored in the <c>LineNumber</c> property of the returned structure.</param>
        /// <param name="sLine">The text of the line to analyse.</param>
        /// <param name="separator">The separator string that delimits the tokens.</param>
        /// <returns>A new line structure with one entry per token: <c>typeof(DBNull)</c> for an empty token, <c>typeof(double)</c> for a numeric token,
        /// <c>typeof(System.DateTime)</c> for a date/time token, and <c>typeof(string)</c> for everything else.</returns>
        /// <remarks>
        /// The separator is searched with an ordinal comparison, and the search resumes behind the complete separator,
        /// so a separator longer than one character does not leak into the following token.
        /// </remarks>
        public static AsciiLineStructure AssumeSeparator(int nLine, string sLine, string separator)
        {
            AsciiLineStructure tabStruc = new AsciiLineStructure();

            tabStruc.LineNumber = nLine;

            int len = sLine.Length;
            int ix  = 0;

            // Skip the whole separator after each token. Advance by at least one character,
            // so that an empty separator cannot stall the loop.
            int step = separator.Length > 0 ? separator.Length : 1;

            for (int start = 0; start <= len; start = ix + step)
            {
                ix = sLine.IndexOf(separator, start, len - start, System.StringComparison.Ordinal);
                if (ix < 0)
                {
                    ix = len; // no further separator: the token extends to the end of the line
                }

                // try to interpret the token first as numeric, then as DateTime and finally as string
                string substring = sLine.Substring(start, ix - start);
                if (ix == start) // nothing is between the last separator and this one
                {
                    tabStruc.Add(typeof(DBNull));
                }
                else if (IsNumeric(substring))
                {
                    tabStruc.Add(typeof(double));
                    tabStruc.AddToDecimalSeparatorStatistics(substring); // make a statistics of the use of decimal separator
                }
                else if (IsDateTime(substring))
                {
                    tabStruc.Add(typeof(System.DateTime));
                }
                else
                {
                    tabStruc.Add(typeof(string));
                }
            } // end for
            return tabStruc;
        }
Beispiel #6
0
		/// <summary>
		/// Determines the column type of every token of an already separated line of text and collects the types in a line structure.
		/// </summary>
		/// <param name="nLine">Line number. The value is not read by this method.</param>
		/// <param name="tokens">The content of the line, already separated into tokens.</param>
		/// <param name="numberFormat">The culture used to recognize numbers.</param>
		/// <param name="dateTimeFormat">The culture used to recognize date/time values.</param>
		/// <returns>A new line structure that received exactly one column entry per token, in token order.</returns>
		public static AsciiLineStructure GetStructure(int nLine, IEnumerable<string> tokens, System.Globalization.CultureInfo numberFormat, System.Globalization.CultureInfo dateTimeFormat)
		{
			AsciiLineStructure structure = new AsciiLineStructure();

			foreach (string token in tokens)
			{
				if (string.IsNullOrEmpty(token))
				{
					// nothing is between the previous separator and this one
					structure.Add(AsciiColumnInfo.DBNull);
				}
				else if (!IsNumeric(token, numberFormat))
				{
					// non-numeric tokens are either date/time values or plain text
					if (IsDateTime(token, dateTimeFormat))
					{
						structure.Add(AsciiColumnInfo.DateTime);
					}
					else
					{
						structure.Add(AsciiColumnInfo.Text);
					}
				}
				else if (IsIntegral(token, numberFormat))
				{
					structure.Add(AsciiColumnInfo.Integer);
				}
				else if (!IsFloat(token, numberFormat))
				{
					structure.Add(AsciiColumnInfo.GeneralNumber);
				}
				else if (token.Contains(numberFormat.NumberFormat.NumberDecimalSeparator))
				{
					structure.Add(AsciiColumnInfo.FloatWithDecimalSeparator);
				}
				else
				{
					structure.Add(AsciiColumnInfo.FloatWithoutDecimalSeparator);
				}
			}

			return structure;
		}
Beispiel #7
0
        /// <summary>
        /// Picks the entry of <see cref="_lineAnalysisOptionsScoring"/> with the highest score and stores its key in
        /// <see cref="_highestScoredLineAnalysisOption"/> and its line structure in <see cref="_highestScoredLineStructure"/>.
        /// </summary>
        /// <remarks>
        /// The score of an entry is the product of its number of lines and the scoring of its line structure.
        /// If two entries have the same score, the one with more lines is preferred.
        /// </remarks>
        private void EvaluateHighestScoredLineAnalysisOption()
        {
            var    winner      = _lineAnalysisOptionsScoring.First();
            double winnerScore = int.MinValue;

            foreach (var candidate in _lineAnalysisOptionsScoring)
            {
                double candidateScore = (double)candidate.Value.NumberOfLines * candidate.Value.LineStructure.LineStructureScoring;

                bool scoresHigher     = candidateScore > winnerScore;
                bool tiesWithMoreLines = !scoresHigher
                                         && candidateScore == winnerScore
                                         && candidate.Value.NumberOfLines > winner.Value.NumberOfLines;

                if (scoresHigher)
                {
                    winnerScore = candidateScore;
                }
                if (scoresHigher || tiesWithMoreLines)
                {
                    winner = candidate;
                }
            }

            _highestScoredLineAnalysisOption = winner.Key;
            _highestScoredLineStructure      = winner.Value.LineStructure;
        }
Beispiel #8
0
        /// <summary>
        /// Determines, for one analysis option, which line structure is the most frequent one among the analysed lines,
        /// and which of the lines with that structure has the best scoring.
        /// </summary>
        /// <param name="analysisOption">The analysis option that is used to index each element of <paramref name="result"/>.</param>
        /// <param name="result">The analysis results, one element per analysed line.</param>
        /// <param name="excludeLineStructureHashes">Hash codes of line structures that are skipped when searching for the most frequent structure.
        /// May be <c>null</c>. If a set is provided, this method may add hash codes to it.</param>
        /// <param name="maxNumberOfEqualLines">On return, the number of lines that share the hash code of the most frequent, non-excluded line structure
        /// (0 if there is none).</param>
        /// <param name="bestLine">On return, the line structure chosen among the lines with the most frequent hash code: the one with the highest
        /// non-negative scoring (the last one in list order if several share it); <c>null</c> if no line was chosen.</param>
        /// <remarks>
        /// Line structures are grouped by their hash code only. If the chosen line structure has no columns, its hash code is
        /// added to the excluded hash codes and the evaluation is repeated, in order to get the next best line structure.
        /// </remarks>
        public static void CalculateScoreOfLineAnalysisOption(AsciiLineAnalysisOption analysisOption, IList <AsciiLineAnalysis> result, HashSet <int> excludeLineStructureHashes, out int maxNumberOfEqualLines, out AsciiLineStructure bestLine)
        {
            bestLine = null;

            // Key: hash code of a line structure, Value: number of lines whose structure has this hash code.
            var lineCountByHash = new Dictionary <int, int>();
            for (int lineIndex = 0; lineIndex < result.Count; ++lineIndex)
            {
                int hash = result[lineIndex][analysisOption].GetHashCode();
                int countSoFar;
                lineCountByHash.TryGetValue(hash, out countSoFar); // leaves 0 in countSoFar if the hash is new
                lineCountByHash[hash] = countSoFar + 1;
            }

            // Find the most frequent hash code that is not excluded.
            maxNumberOfEqualLines = 0;
            int winningHash = 0;
            foreach (var pair in lineCountByHash)
            {
                bool isExcluded = excludeLineStructureHashes != null && excludeLineStructureHashes.Contains(pair.Key);
                if (!isExcluded && pair.Value > maxNumberOfEqualLines)
                {
                    maxNumberOfEqualLines = pair.Value;
                    winningHash           = pair.Key;
                }
            }

            // Among the lines with the winning hash code, take the one with the highest scoring.
            int highestScoring = 0;
            for (int lineIndex = 0; lineIndex < result.Count; ++lineIndex)
            {
                var candidate = result[lineIndex][analysisOption];
                if (candidate.GetHashCode() != winningHash)
                {
                    continue;
                }

                int scoring = candidate.LineStructureScoring;
                if (scoring >= highestScoring)
                {
                    highestScoring = scoring;
                    bestLine       = candidate;
                }
            }

            // Only a best line without any columns requires another pass.
            if (bestLine == null || bestLine.Count != 0)
            {
                return;
            }

            // Exclude the hash code of the empty structure and evaluate again to get the next best line.
            // Add returns false if the hash code is already excluded; in that case the result is left as it is.
            if (excludeLineStructureHashes == null)
            {
                excludeLineStructureHashes = new HashSet <int>();
            }
            if (excludeLineStructureHashes.Add(winningHash))
            {
                CalculateScoreOfLineAnalysisOption(analysisOption, result, excludeLineStructureHashes, out maxNumberOfEqualLines, out bestLine);
            }
        }
Beispiel #9
0
 /// <summary>
 /// Determines the most frequent line structure and its best line for one analysis option,
 /// by forwarding to the overload that accepts a set of excluded line structure hashes and passing <c>null</c> for that set.
 /// </summary>
 /// <param name="analysisOption">The analysis option; forwarded unchanged.</param>
 /// <param name="result">The analysis results of the lines; forwarded unchanged.</param>
 /// <param name="maxNumberOfEqualLines">Receives the value that the called overload assigns to its parameter of the same name.</param>
 /// <param name="bestLine">Receives the value that the called overload assigns to its parameter of the same name.</param>
 public static void CalculateScoreOfLineAnalysisOption(AsciiLineAnalysisOption analysisOption, IList <AsciiLineAnalysis> result, out int maxNumberOfEqualLines, out AsciiLineStructure bestLine)
 {
     // No line structure hash is excluded up front.
     HashSet <int> noExcludedHashes = null;

     CalculateScoreOfLineAnalysisOption(analysisOption, result, noExcludedHashes, out maxNumberOfEqualLines, out bestLine);
 }
Beispiel #10
0
		/// <summary>
		/// Determines which line structure occurs most frequently among the analysed lines for one analysis option,
		/// and selects the best scored line that has this structure.
		/// </summary>
		/// <param name="analysisOption">The analysis option that is used to index each element of <paramref name="result"/>.</param>
		/// <param name="result">The analysis results, one element per analysed line.</param>
		/// <param name="excludeLineStructureHashes">Hash codes of line structures that are ignored when searching for the most frequent structure.
		/// May be <c>null</c>. If a set is provided, this method may add hash codes to it.</param>
		/// <param name="maxNumberOfEqualLines">On return, the number of lines that share the hash code of the most frequent, non-excluded line structure
		/// (0 if there is none).</param>
		/// <param name="bestLine">On return, the line structure chosen among the lines with the most frequent hash code: the one with the highest
		/// non-negative scoring (the last one in list order if several share it); <c>null</c> if no line was chosen.</param>
		/// <remarks>
		/// Line structures are grouped by their hash code only. If the chosen line structure has a column count of zero, its hash code
		/// is added to the excluded hash codes and this method calls itself again to get the next best line structure.
		/// </remarks>
		public static void CalculateScoreOfLineAnalysisOption(AsciiLineAnalysisOption analysisOption, IList<AsciiLineAnalysis> result, HashSet<int> excludeLineStructureHashes, out int maxNumberOfEqualLines, out AsciiLineStructure bestLine)
		{
			bestLine = null;

			// Key: hash code of a line structure, Value: number of lines whose structure has this hash code.
			var occurrences = new Dictionary<int, int>();
			foreach (AsciiLineAnalysis analysedLine in result)
			{
				int hash = analysedLine[analysisOption].GetHashCode();
				int seen;
				if (!occurrences.TryGetValue(hash, out seen))
				{
					seen = 0;
				}
				occurrences[hash] = seen + 1;
			}

			// Find the most frequent hash code, ignoring the excluded ones.
			maxNumberOfEqualLines = 0;
			int mostFrequentHash = 0;
			foreach (KeyValuePair<int, int> occurrence in occurrences)
			{
				if (excludeLineStructureHashes != null && excludeLineStructureHashes.Contains(occurrence.Key))
				{
					continue;
				}

				if (occurrence.Value > maxNumberOfEqualLines)
				{
					maxNumberOfEqualLines = occurrence.Value;
					mostFrequentHash = occurrence.Key;
				}
			}

			// Among the lines with the most frequent hash code, take the one with the highest scoring.
			int topScoring = 0;
			foreach (AsciiLineAnalysis analysedLine in result)
			{
				var structure = analysedLine[analysisOption];
				if (structure.GetHashCode() == mostFrequentHash && structure.LineStructureScoring >= topScoring)
				{
					topScoring = structure.LineStructureScoring;
					bestLine = structure;
				}
			}

			// A best line with a column count of zero is not useful: exclude its hash code and evaluate again.
			bool bestLineIsEmpty = bestLine != null && bestLine.Count == 0;
			if (bestLineIsEmpty)
			{
				if (excludeLineStructureHashes == null)
				{
					excludeLineStructureHashes = new HashSet<int>();
				}

				// Add returns false if the hash code is already excluded; then the result is left as it is.
				bool newlyExcluded = excludeLineStructureHashes.Add(mostFrequentHash);
				if (newlyExcluded)
				{
					CalculateScoreOfLineAnalysisOption(analysisOption, result, excludeLineStructureHashes, out maxNumberOfEqualLines, out bestLine);
				}
			}
		}
Beispiel #11
0
		/// <summary>
		/// Determines the most frequent line structure and its best line for one analysis option.
		/// This overload calls the overload that accepts a set of excluded line structure hashes, with <c>null</c> as that set.
		/// </summary>
		/// <param name="analysisOption">The analysis option; passed on unchanged.</param>
		/// <param name="result">The analysis results of the lines; passed on unchanged.</param>
		/// <param name="maxNumberOfEqualLines">Receives the value that the called overload assigns to its parameter of the same name.</param>
		/// <param name="bestLine">Receives the value that the called overload assigns to its parameter of the same name.</param>
		public static void CalculateScoreOfLineAnalysisOption(AsciiLineAnalysisOption analysisOption, IList<AsciiLineAnalysis> result, out int maxNumberOfEqualLines, out AsciiLineStructure bestLine)
		{
			// Start without any excluded line structure hashes.
			HashSet<int> excludedHashes = null;

			CalculateScoreOfLineAnalysisOption(
				analysisOption,
				result,
				excludedHashes,
				out maxNumberOfEqualLines,
				out bestLine);
		}
Beispiel #12
0
		/// <summary>
		/// Selects the highest scored entry of <see cref="_lineAnalysisOptionsScoring"/>. Its key is stored in
		/// <see cref="_highestScoredLineAnalysisOption"/>, and its line structure in <see cref="_highestScoredLineStructure"/>.
		/// </summary>
		/// <remarks>
		/// An entry is scored by the product of its number of lines and the scoring of its line structure.
		/// Among entries with equal score, the entry with the greater number of lines wins.
		/// </remarks>
		private void EvaluateHighestScoredLineAnalysisOption()
		{
			var best = _lineAnalysisOptionsScoring.First();
			double bestScore = int.MinValue;

			foreach (var option in _lineAnalysisOptionsScoring)
			{
				var scoring = option.Value;
				double optionScore = (double)scoring.NumberOfLines * scoring.LineStructure.LineStructureScoring;

				// A lower score can never replace the current best entry.
				if (!(optionScore >= bestScore))
				{
					continue;
				}

				// With equal score, only an entry with more lines replaces the current best entry.
				if (optionScore == bestScore && !(scoring.NumberOfLines > best.Value.NumberOfLines))
				{
					continue;
				}

				bestScore = optionScore;
				best = option;
			}

			_highestScoredLineAnalysisOption = best.Key;
			_highestScoredLineStructure = best.Value.LineStructure;
		}
Beispiel #13
0
 /// <summary>
 /// Initializes a new instance of <see cref="CollectionWrapper"/> that keeps a reference to the given line structure.
 /// </summary>
 /// <param name="parent">The line structure that is stored in the <c>_parent</c> field.</param>
 public CollectionWrapper(AsciiLineStructure parent)
 {
     _parent = parent;
 }
    /// <summary>
    /// Determines whether this line structure is compatible with another line structure.
    /// </summary>
    /// <param name="ano">The other line structure to compare with.</param>
    /// <returns><c>false</c> if this structure has fewer columns than <paramref name="ano"/>, or if at some column index of <paramref name="ano"/>
    /// both entries differ from <c>typeof(DBNull)</c> and are not equal to each other; otherwise, <c>true</c>.</returns>
    public bool IsCompatibleWith(AsciiLineStructure ano)
    {
      // our structure may have more columns than ano, but not fewer
      bool compatible = this.Count >= ano.Count;

      for(int column=0; compatible && column<ano.Count; column++)
      {
        // an empty (DBNull) entry on either side matches everything
        if(this[column]==typeof(DBNull) || ano[column]==typeof(DBNull))
        {
          continue;
        }

        compatible = this[column]==ano[column];
      }

      return compatible;
    }
    /// <summary>
    /// Splits a line of text at every occurrence of a separator string and records the recognized type of each resulting token.
    /// </summary>
    /// <param name="nLine">Line number; stored in the <c>LineNumber</c> property of the returned structure.</param>
    /// <param name="sLine">The text of the line to analyse.</param>
    /// <param name="separator">The separator string that delimits the tokens.</param>
    /// <returns>A new line structure with one entry per token: <c>typeof(DBNull)</c> for an empty token, <c>typeof(double)</c> for a numeric token,
    /// <c>typeof(System.DateTime)</c> for a date/time token, and <c>typeof(string)</c> for everything else.</returns>
    /// <remarks>
    /// The separator is searched with an ordinal comparison, and the search resumes behind the complete separator,
    /// so a separator longer than one character does not leak into the following token.
    /// </remarks>
    public static AsciiLineStructure AssumeSeparator(int nLine, string sLine, string separator)
    {
      AsciiLineStructure tabStruc = new AsciiLineStructure();
      tabStruc.LineNumber = nLine;

      int len = sLine.Length;
      int ix = 0;

      // Skip the whole separator after each token. Advance by at least one character,
      // so that an empty separator cannot stall the loop.
      int step = separator.Length>0 ? separator.Length : 1;

      for(int start=0; start<=len; start=ix+step)
      {
        ix = sLine.IndexOf(separator, start, len-start, System.StringComparison.Ordinal);
        if(ix<0)
        {
          ix = len; // no further separator: the token extends to the end of the line
        }

        // try to interpret the token first as numeric, then as DateTime and finally as string
        string substring = sLine.Substring(start, ix-start);
        if(ix==start) // nothing is between the last separator and this one
        {
          tabStruc.Add(typeof(DBNull));
        }
        else if(IsNumeric(substring))
        {
          tabStruc.Add(typeof(double));
          tabStruc.AddToDecimalSeparatorStatistics(substring); // make a statistics of the use of decimal separator
        }
        else if(IsDateTime(substring))
        {
          tabStruc.Add(typeof(System.DateTime));
        }
        else
        {
          tabStruc.Add(typeof(string));
        }
      } // end for
      return tabStruc;
    }
Beispiel #16
0
        /// <summary>
        /// Finds, for one separation strategy, the most frequent line structure among the analysed lines
        /// and the line with the highest priority that has this structure.
        /// </summary>
        /// <param name="result">List of <see cref="AsciiLineAnalyzer"/> instances, one per analysed line.</param>
        /// <param name="sep">The separation strategy; its integer value is used as index into the <c>structure</c> member of each analyzer.</param>
        /// <param name="bestLine">Is set to <c>null</c> at the start. On return, the line structure with the highest <c>Priority</c>
        /// among the lines whose structure hash code is the most frequent one (the first such line if several share the highest priority),
        /// or <c>null</c> if <paramref name="result"/> contains no line with that hash code.</param>
        /// <returns>The number of lines that share the most frequent structure hash code (0 if <paramref name="result"/> is empty).</returns>
        /// <remarks>
        /// Line structures are grouped by their hash code only, so different structures with the same hash code are counted together.
        /// </remarks>
        public static int GetPriorityOf(System.Collections.ArrayList result, AsciiLineAnalyzer.Separation sep, ref AsciiLineStructure bestLine)
        {
            // Key: hash code of a line structure, Value: number of lines with that hash code.
            System.Collections.Hashtable sl = new System.Collections.Hashtable();
            bestLine = null;
            for (int i = 0; i < result.Count; i++)
            {
                int p = ((AsciiLineAnalyzer)result[i]).structure[(int)sep].GetHashCode();
                object countSoFar = sl[p]; // null if this hash code was not seen before
                sl[p] = (null == countSoFar) ? 1 : 1 + (int)countSoFar;
            }

            // get the count with the topmost frequency
            int nNumberOfMaxSame = 0;
            int nHashOfMaxSame   = 0;

            foreach (System.Collections.DictionaryEntry ohash in sl)
            {
                int cnt = (int)ohash.Value;
                if (nNumberOfMaxSame < cnt)
                {
                    nNumberOfMaxSame = cnt;
                    nHashOfMaxSame   = (int)ohash.Key;
                }
            } // for each

            // search for the max priority among the lines with the most frequent hash
            int nMaxPriorityOfMaxSame = 0;

            for (int i = 0; i < result.Count; i++)
            {
                AsciiLineStructure lineStructure = ((AsciiLineAnalyzer)result[i]).structure[(int)sep];
                if (nHashOfMaxSame != lineStructure.GetHashCode())
                {
                    continue;
                }

                int prty = lineStructure.Priority;
                // The first matching line is always accepted. Otherwise bestLine would stay null
                // whenever no matching line has a priority greater than zero.
                if (null == bestLine || prty > nMaxPriorityOfMaxSame)
                {
                    nMaxPriorityOfMaxSame = prty;
                    bestLine = lineStructure;
                }
            } // for
            return nNumberOfMaxSame;
        }