예제 #1
0
        /// <summary>
        /// Analyzes the first <code>nLines</code> lines of the ascii stream and derives import options from them.
        /// </summary>
        /// <remarks>
        /// The stream is rewound to position 0 before reading. Three candidate separators are evaluated
        /// (index 0 = tab, 1 = comma, 2 = semicolon). For each one, <c>GetPriorityOf</c> delivers a line count and a
        /// structure; the separator with the highest product of line count and structure priority is chosen, and
        /// on equal products the candidate with more lines wins.
        /// </remarks>
        /// <param name="nLines">The maximum number of lines to analyze. It is not an error if the stream contains fewer lines than this.</param>
        /// <param name="defaultImportOptions">The default import options. They are cloned; the clone is modified and returned.</param>
        /// <returns>Import options that can be used in a following step to read in the ascii stream. Null is returned if the stream contains no data.</returns>
        public AsciiImportOptions Analyze(int nLines, AsciiImportOptions defaultImportOptions)
        {
            stream.Position = 0;

            // NOTE(review): the reader is not disposed here. Disposing a StreamReader also closes the
            // underlying stream, which is presumably still needed after the analysis - confirm.
            System.IO.StreamReader       sr     = new System.IO.StreamReader(stream, System.Text.Encoding.Default, true);
            System.Collections.ArrayList result = new System.Collections.ArrayList();

            // Analyze at most nLines lines; stop early when the stream ends.
            for (int i = 0; i < nLines; i++)
            {
                string sLine = sr.ReadLine();
                if (null == sLine)
                {
                    break;
                }
                result.Add(new AsciiLineAnalyzer(i, sLine));
            }

            if (result.Count == 0)
            {
                return null; // there is nothing to analyze
            }

            // Evaluate every candidate separator (0 = tab, 1 = comma, 2 = semicolon).
            NumberAndStructure[] st = new NumberAndStructure[3];

            for (int i = 0; i < st.Length; i++)
            {
                st[i].nLines = GetPriorityOf(result, (AsciiLineAnalyzer.Separation)i, ref st[i].structure);
            }

            // Look for the best separator: highest (line count * structure priority),
            // ties being resolved in favour of the candidate with more lines.
            int    nMaxLines      = int.MinValue;
            double maxprtylines   = 0;
            int    nBestSeparator = int.MinValue;

            for (int i = 0; i < st.Length; i++)
            {
                double prtylines = (double)st[i].nLines * st[i].structure.Priority;
                if (prtylines == maxprtylines)
                {
                    if (st[i].nLines > nMaxLines)
                    {
                        nMaxLines      = st[i].nLines;
                        nBestSeparator = i;
                    }
                }
                else if (prtylines > maxprtylines)
                {
                    maxprtylines   = prtylines;
                    nBestSeparator = i;
                    nMaxLines      = st[i].nLines;
                }
            }

            // NOTE(review): if every weighted score is negative or NaN, nBestSeparator keeps its
            // int.MinValue sentinel and the array accesses below throw. Confirm that Priority can
            // never be negative or NaN.
            AsciiImportOptions opt = defaultImportOptions.Clone();

            opt.bDelimited          = true;
            opt.cDelimiter          = nBestSeparator == 0 ? '\t' : (nBestSeparator == 1 ? ',' : ';');
            opt.recognizedStructure = st[nBestSeparator].structure;

            // Determine the number of header lines: the index of the first analyzed line whose structure
            // is compatible with the recognized structure. If no line is compatible, the value ends up
            // as the index of the last analyzed line.
            for (int i = 0; i < result.Count; i++)
            {
                AsciiLineAnalyzer line = (AsciiLineAnalyzer)result[i];

                opt.nMainHeaderLines = i;
                if (line.structure[nBestSeparator].IsCompatibleWith(opt.recognizedStructure))
                {
                    break;
                }
            }

            // Accumulate the decimal separator statistics over all analyzed lines.
            opt.m_DecimalSeparatorCommaCount = 0;
            opt.m_DecimalSeparatorDotCount   = 0;
            for (int i = 0; i < result.Count; i++)
            {
                AsciiLineAnalyzer line = (AsciiLineAnalyzer)result[i];

                opt.m_DecimalSeparatorDotCount   += line.structure[nBestSeparator].DecimalSeparatorDotCount;
                opt.m_DecimalSeparatorCommaCount += line.structure[nBestSeparator].DecimalSeparatorCommaCount;
            }

            return opt;
        }
예제 #2
0
    /// <summary>
    /// Analyzes the first <code>nLines</code> lines of the ascii stream and derives import options from them.
    /// </summary>
    /// <remarks>
    /// The stream is rewound to position 0 before reading. Three candidate separators are evaluated
    /// (index 0 = tab, 1 = comma, 2 = semicolon). For each one, <c>GetPriorityOf</c> delivers a line count and a
    /// structure; the separator with the highest product of line count and structure priority is chosen, and
    /// on equal products the candidate with more lines wins.
    /// </remarks>
    /// <param name="nLines">The maximum number of lines to analyze. It is not an error if the stream contains fewer lines than this.</param>
    /// <param name="defaultImportOptions">The default import options. They are cloned; the clone is modified and returned.</param>
    /// <returns>Import options that can be used in a following step to read in the ascii stream. Null is returned if the stream contains no data.</returns>
    public AsciiImportOptions Analyze(int nLines, AsciiImportOptions defaultImportOptions)
    {
      stream.Position = 0;

      // NOTE(review): the reader is not disposed here. Disposing a StreamReader also closes the
      // underlying stream, which is presumably still needed after the analysis - confirm.
      System.IO.StreamReader sr = new System.IO.StreamReader(stream,System.Text.Encoding.Default,true);
      System.Collections.ArrayList result = new System.Collections.ArrayList();

      // Analyze at most nLines lines; stop early when the stream ends.
      for(int i=0;i<nLines;i++)
      {
        string sLine = sr.ReadLine();
        if(null==sLine)
          break;
        result.Add(new AsciiLineAnalyzer(i,sLine));
      }

      if(result.Count==0)
        return null; // there is nothing to analyze

      // Evaluate every candidate separator (0 = tab, 1 = comma, 2 = semicolon).
      NumberAndStructure[] st = new NumberAndStructure[3];

      for(int i=0;i<st.Length;i++)
      {
        st[i].nLines = GetPriorityOf(result,(AsciiLineAnalyzer.Separation)i,ref st[i].structure);
      }

      // Look for the best separator: highest (line count * structure priority),
      // ties being resolved in favour of the candidate with more lines.
      int nMaxLines = int.MinValue;
      double maxprtylines=0;
      int nBestSeparator = int.MinValue;
      for(int i=0;i<st.Length;i++)
      {
        double prtylines = (double)st[i].nLines * st[i].structure.Priority;
        if(prtylines==maxprtylines)
        {
          if(st[i].nLines > nMaxLines)
          {
            nMaxLines = st[i].nLines;
            nBestSeparator = i;
          }
        }
        else if(prtylines>maxprtylines)
        {
          maxprtylines = prtylines;
          nBestSeparator = i;
          nMaxLines=st[i].nLines;
        }
      }

      // NOTE(review): if every weighted score is negative or NaN, nBestSeparator keeps its
      // int.MinValue sentinel and the array accesses below throw. Confirm that Priority can
      // never be negative or NaN.
      AsciiImportOptions opt = defaultImportOptions.Clone();

      opt.bDelimited = true;
      opt.cDelimiter = nBestSeparator==0 ? '\t' : (nBestSeparator==1 ? ',' : ';');
      opt.recognizedStructure = st[nBestSeparator].structure;

      // Determine the number of header lines: the index of the first analyzed line whose structure
      // is compatible with the recognized structure. If no line is compatible, the value ends up
      // as the index of the last analyzed line.
      for(int i=0;i<result.Count;i++)
      {
        AsciiLineAnalyzer line = (AsciiLineAnalyzer)result[i];

        opt.nMainHeaderLines=i;
        if(line.structure[nBestSeparator].IsCompatibleWith(opt.recognizedStructure))
          break;
      }

      // Accumulate the decimal separator statistics over all analyzed lines.
      opt.m_DecimalSeparatorCommaCount=0;
      opt.m_DecimalSeparatorDotCount=0;
      for(int i=0;i<result.Count;i++)
      {
        AsciiLineAnalyzer line = (AsciiLineAnalyzer)result[i];

        opt.m_DecimalSeparatorDotCount += line.structure[nBestSeparator].DecimalSeparatorDotCount;
        opt.m_DecimalSeparatorCommaCount += line.structure[nBestSeparator].DecimalSeparatorCommaCount;
      }

      return opt;
    }