예제 #1
0
        /*
         * reads the next line from the given stream and splits it into fields
         *
         * input:
         *   sr -- stream to read
         *   sep -- separator characters to split on; if null, the line is
         *       split on whitespace, empty fields are dropped and lines
         *       without any field are skipped
         *   empty -- if not null (and sep is given), every empty field is
         *       replaced with this string
         *   field -- 1-based index of the field containing the numeric ID,
         *       or 0 to skip parsing the ID (e.g. for a header line)
         *
         * output:
         *   id -- the parsed ID; 0 if field == 0 or on EOF
         *
         * returns the fields of the line or null on EOF
         *
         * throws exception if the line has less than field fields or if the
         * ID field cannot be parsed as a 64-bit integer (the message is
         * written to the standard error as well)
         */
        public static string[] ReadLine(StreamLineNum sr, char[] sep, string empty,
                                        int field, out Int64 id)
        {
            string[] r = null;
            id = 0;
            do
            {
                string line = sr.ReadLine();
                if (line == null)
                {
                    return null;     // EOF
                }
                if (sep != null)
                {
                    // explicit separator: empty fields are significant and
                    // are kept (optionally replaced by the placeholder)
                    r = line.Split(sep);
                    if (empty != null)
                    {
                        for (int i = 0; i < r.Length; i++)
                        {
                            if (r[i].Length == 0)
                            {
                                r[i] = empty;
                            }
                        }
                    }
                }
                else
                {
                    // a null separator makes Split() use whitespace as the
                    // delimiter; a blank line gives zero fields here
                    r = line.Split(sep, StringSplitOptions.RemoveEmptyEntries);
                }
            } while (r.Length == 0);     // skip lines without any field

            if (r.Length < field)
            {
                string err = "Invalid data in input file " + sr.Fn + ", line " +
                             sr.Line.ToString() + ": too few fields ( expected at least "
                             + field.ToString() + ", found only " + r.Length + ")!\n";
                Console.Error.WriteLine("{0}", err);
                throw new Exception(err);
            }
            if (field == 0)
            {
                return r;            // no ID requested
            }
            if (!Int64.TryParse(r[field - 1], out id))
            {
                // report the field that was actually parsed; field is 1-based,
                // so r[field] would be the wrong field or even out of range
                string err = "Invalid data in input file " + sr.Fn + ", line " +
                             sr.Line.ToString() + ": could not parse " + r[field - 1] +
                             " as number!\n";
                Console.Error.WriteLine("{0}", err);
                throw new Exception(err);
            }

            return r;
        }
예제 #2
0
 /*
  * collects the next group of consecutive lines sharing the same ID
  *
  * input:
  *   sr -- stream to read
  *   nextline -- line read ahead by the previous call (null if none)
  *   nextid -- ID belonging to nextline (if it was read ahead)
  *   field -- field containing the ID
  *   req_fields -- minimum required number of fields
  *   sep -- separator characters to use
  *   empty -- replace empty fields with this string
  *
  * output:
  *   id -- ID of the collected group (not modified if nothing was read)
  *   lines -- the lines of the group; empty if the stream had no more lines
  *   nextline -- first line after the group, null on EOF
  *   nextid -- ID of nextline; on EOF it holds whatever ReadLine() stored
  *       in its out parameter
  *
  * throws exception on format error
  */
 public static void ReadNext(StreamLineNum sr, List <string[]> lines,
                             ref Int64 id, ref Int64 nextid, ref string[] nextline, int field,
                             int req_fields, char[] sep, string empty)
 {
     lines.Clear();

     // true while nextline still holds the line carried over from the
     // previous call, which was already validated there
     bool carried = (nextline != null);

     for (;;)
     {
         if (!carried)
         {
             nextline = ReadLine(sr, sep, empty, field, out nextid);
             if (nextline == null)
             {
                 return;                                  // empty file or end of file
             }
             if (nextline.Length < req_fields)
             {
                 string err = "Invalid data in input file " + sr.Fn +
                              ", line " + sr.Line.ToString() +
                              ": too few fields ( expected at least " +
                              req_fields.ToString() + ", found only " +
                              nextline.Length + ")!\n";
                 Console.Error.WriteLine("{0}", err);
                 throw new Exception(err);
             }
             if (lines.Count > 0 && nextid != id)
             {
                 return;                                  // first line of the next group, keep it for later
             }
         }
         carried = false;

         if (lines.Count == 0)
         {
             id = nextid;                                 // the first line defines the ID of the group
         }
         lines.Add(nextline);
     }
 }
예제 #3
0
        /*
         * numjoin entry point: joins the lines of two input files on a numeric
         * ID field in a single merge pass (the IDs have to be in ascending
         * order in both files) and writes the result to the standard output
         *
         * options (processed until the first non-option argument):
         *   -1 N / -2 N -- ID field in file 1 / file 2 (1-based, default: 1)
         *   -j N -- ID field in both files
         *   -t C -- field separator character (default: split the input on
         *       whitespace, separate the output with tabs)
         *   -e STR -- string to replace empty input fields with
         *   -a 1|2 -- also output the unpaired lines from the given file
         *   -v 1|2 -- output only the unpaired lines from the given file
         *   -o1 LIST / -o2 LIST -- comma separated list of the fields to
         *       output from file 1 / file 2; the list can be empty or omitted
         *   -H -- the first line of both files is a header
         *   -c -- check the sort order after matching lines as well
         *   -h -- print the usage text
         *
         * the options are followed by the two input file names, - stands for
         * the standard input
         *
         * statistics and error messages are written to the standard error
         */
        public static void Main(string[] args)
        {
            string file1 = null;
            string file2 = null;

            int field1      = 1;
            int field2      = 1;
            int req_fields1 = 1;
            int req_fields2 = 1;

            List <int> outfields1 = null;
            List <int> outfields2 = null;

            char[] sep   = (char[])null;
            string empty = null;

            int  unpaired      = 0;       // if 1 or 2, print unpaired lines from the given file
            bool only_unpaired = false;
            bool header        = false;
            bool strict_order  = false;

            // process option arguments
            int i = 0;

            for (; i < args.Length; i++)
            {
                if (!(args[i].Length > 1 && args[i][0] == '-'))
                {
                    break;          // non-option argument, means the filenames
                }

                char opt = args[i][1];

                // these options take their value from the next argument, make
                // sure that it exists before args[i + 1] is used
                // (-o is checked separately as its list can be omitted)
                if ("12jteav".IndexOf(opt) >= 0 && i + 1 >= args.Length)
                {
                    Console.Error.WriteLine("Missing value for parameter: {0}\n  use numjoin -h for help\n", args[i]);
                    return;
                }

                switch (opt)
                {
                case '1':
                    if (!TryParseIntOption(args[i], args[i + 1], out field1))
                    {
                        return;
                    }
                    i++;
                    break;

                case '2':
                    if (!TryParseIntOption(args[i], args[i + 1], out field2))
                    {
                        return;
                    }
                    i++;
                    break;

                case 'j':
                    if (!TryParseIntOption(args[i], args[i + 1], out field1))
                    {
                        return;
                    }
                    field2 = field1;
                    i++;
                    break;

                case 't':
                    if (args[i + 1].Length == 0)
                    {
                        Console.Error.WriteLine("Invalid parameter: {0} needs a separator character\n  use numjoin -h for help\n", args[i]);
                        return;
                    }
                    sep = new char[1] {
                        args[i + 1][0]
                    };
                    i++;
                    break;

                case 'e':
                    empty = args[i + 1];
                    i++;
                    break;

                case 'a':
                    if (!TryParseIntOption(args[i], args[i + 1], out unpaired))
                    {
                        return;
                    }
                    if (!(unpaired == 1 || unpaired == 2))
                    {
                        Console.Error.WriteLine("-a parameter has to be either 1 or 2\n  use numjoin -h for help\n");
                        return;
                    }
                    i++;
                    break;

                case 'v':
                    if (!TryParseIntOption(args[i], args[i + 1], out unpaired))
                    {
                        return;
                    }
                    if (!(unpaired == 1 || unpaired == 2))
                    {
                        Console.Error.WriteLine("-v parameter has to be either 1 or 2\n  use numjoin -h for help\n");
                        return;
                    }
                    i++;
                    only_unpaired = true;
                    break;

                case 'o':
                    if (args[i].Length < 3 || !(args[i][2] == '1' || args[i][2] == '2'))
                    {
                        Console.Error.WriteLine("Invalid parameter: {0}\n  (use -o1 or -o2)\n  use numjoin -h for help\n", args[i]);
                        return;
                    }
                    {
                        List <int> tmp = new List <int>();
                        int        max = 0;
                        // it is valid to give zero output columns from one of the files
                        // (e.g. to filter the other file), either with an empty string
                        // as the argument (i.e. -o1 "") or by leaving the list out;
                        // an argument starting with '-' is not a list: it is the next
                        // option (or the - filename), so it must not be consumed here
                        bool has_arg = i + 1 < args.Length &&
                                       (args[i + 1].Length == 0 || args[i + 1][0] != '-');
                        if (has_arg && args[i + 1].Length > 0)
                        {
                            foreach (string s in args[i + 1].Split(','))
                            {
                                int x;
                                if (!Int32.TryParse(s, out x) || x < 1)
                                {
                                    Console.Error.WriteLine("Invalid parameter: {0} {1}\n  use numjoin -h for help\n", args[i], args[i + 1]);
                                    return;
                                }
                                tmp.Add(x);
                                if (x > max)
                                {
                                    max = x;
                                }
                            }
                        }
                        // every line has to have at least as many fields as the
                        // largest field number requested for output
                        if (args[i][2] == '1')
                        {
                            outfields1 = tmp;
                            if (max > req_fields1)
                            {
                                req_fields1 = max;
                            }
                        }
                        else
                        {
                            outfields2 = tmp;
                            if (max > req_fields2)
                            {
                                req_fields2 = max;
                            }
                        }
                        if (has_arg)
                        {
                            i++;
                        }
                    }
                    break;

                case 'H':
                    header = true;
                    break;

                case 'c':
                    strict_order = true;
                    break;

                case 'h':
                    Console.Write("{0}", usage);
                    return;

                default:
                    Console.Error.WriteLine("Unknown parameter: {0}\n  use numjoin -h for help\n", args[i]);
                    return;
                }
            }
            // i now points to the first filename
            if (i + 1 >= args.Length)
            {
                Console.Error.WriteLine("Error: expecting two input filenames\n  use numjoin -h for help\n");
                return;
            }
            file1 = args[i];
            file2 = args[i + 1];
            if (file1 == file2)
            {
                Console.Error.WriteLine("Error: input files have to be different!\n");
                return;
            }

            if (field1 < 1 || field2 < 1)
            {
                Console.Error.WriteLine("Error: field numbers have to be >= 1!\n");
                return;
            }

            UInt64 out_lines = 0;
            UInt64 matched1  = 0;
            UInt64 matched2  = 0;
            UInt64 unmatched = 0;

            // the using blocks close the inputs and flush the output on every
            // way out, including early returns and exceptions
            using (StreamWriter sw = new StreamWriter(Console.OpenStandardOutput()))
            using (StreamReader sr1 = OpenInput(file1))
            using (StreamReader sr2 = OpenInput(file2))
            {
                Int64 id1 = Int64.MinValue;
                Int64 id2 = Int64.MinValue;

                List <string[]> lines1 = new List <string[]>();
                string[] next1         = null;
                Int64 nextid1          = Int64.MinValue;

                List <string[]> lines2 = new List <string[]>();
                string[] next2         = null;
                Int64 nextid2          = Int64.MinValue;

                StreamLineNum s1 = new StreamLineNum(sr1, file1);
                StreamLineNum s2 = new StreamLineNum(sr2, file2);

                char out_sep = '\t';
                if (sep != null)
                {
                    out_sep = sep[0];
                }

                if (header)
                {
                    // read and write output header
                    Int64    tmp;
                    string[] h1 = ReadLine(s1, sep, empty, 0, out tmp);
                    string[] h2 = ReadLine(s2, sep, empty, 0, out tmp);
                    if (h1 != null && h2 != null)
                    {
                        if (h1.Length < req_fields1)
                        {
                            Console.Error.WriteLine("Header too short in file 1!\n");
                            return;
                        }
                        if (h2.Length < req_fields2)
                        {
                            Console.Error.WriteLine("Header too short in file 2!\n");
                            return;
                        }

                        bool firstout = true;
                        WriteFields(sw, h1, outfields1, ref firstout, out_sep);
                        WriteFields(sw, h2, outfields2, ref firstout, out_sep);
                        // terminate the header line, otherwise the first data
                        // line would be appended to it
                        sw.Write('\n');
                    }
                }

                // read first lines
                ReadNext(s1, lines1, ref id1, ref nextid1, ref next1, field1, req_fields1, sep, empty);
                ReadNext(s2, lines2, ref id2, ref nextid2, ref next2, field2, req_fields2, sep, empty);

                while (true)
                {
                    if (lines1.Count == 0 && lines2.Count == 0)
                    {
                        break;                                                        // end of both files
                    }
                    // one file ended: go on only if the unpaired lines of the
                    // other one are needed
                    if (lines1.Count == 0 && unpaired != 2)
                    {
                        break;
                    }
                    if (lines2.Count == 0 && unpaired != 1)
                    {
                        break;
                    }
                    if (lines1.Count > 0 && lines2.Count > 0 && id1 == id2)
                    {
                        // match, write out (if needed -- not only_unpaired)
                        // there could be several lines from both files, iterate
                        // over the cross product
                        if (!only_unpaired)
                        {
                            matched1 += (UInt64)lines1.Count;
                            matched2 += (UInt64)lines2.Count;
                            foreach (string[] line1 in lines1)
                            {
                                foreach (string[] line2 in lines2)
                                {
                                    bool firstout = true;

                                    // fields from the first file, then from the second
                                    WriteFields(sw, line1, outfields1, ref firstout, out_sep);
                                    WriteFields(sw, line2, outfields2, ref firstout, out_sep);
                                    sw.Write('\n');
                                    out_lines++;
                                }
                            }
                        }

                        if (strict_order)
                        {
                            // check order
                            if (next1 != null && nextid1 < id1)
                            {
                                ReportUnsorted(s1, nextid1, id1);
                                break;
                            }
                            if (next2 != null && nextid2 < id2)
                            {
                                ReportUnsorted(s2, nextid2, id2);
                                break;
                            }
                        }

                        // read next lines
                        ReadNext(s1, lines1, ref id1, ref nextid1, ref next1, field1, req_fields1, sep, empty);
                        ReadNext(s2, lines2, ref id2, ref nextid2, ref next2, field2, req_fields2, sep, empty);
                        continue;     // skip following section, the next lines might be a match as well
                    }

                    // no match
                    if (lines1.Count > 0 && (id1 < id2 || lines2.Count == 0))
                    {
                        // need to advance file1

                        // check if lines from file 1 should be output if not matched
                        if (unpaired == 1)
                        {
                            foreach (string[] line1 in lines1)
                            {
                                // still print unpaired lines from file 1
                                bool firstout = true;
                                WriteFields(sw, line1, outfields1, ref firstout, out_sep);
                                // note: we write empty fields for file 2
                                if (outfields2 != null)
                                {
                                    WriteFields(sw, null, outfields2, ref firstout, out_sep);
                                }
                                sw.Write('\n');
                                out_lines++;
                                unmatched++;
                            }
                        }

                        // first check sort order, that could be a problem here
                        if (next1 != null && nextid1 < id1)
                        {
                            ReportUnsorted(s1, nextid1, id1);
                            break;
                        }

                        ReadNext(s1, lines1, ref id1, ref nextid1, ref next1, field1, req_fields1, sep, empty);
                    }
                    else
                    {
                        // here id2 < id1 or lines1.Count == 0 and lines2.Count > 0
                        // check if lines from file 2 should be output if not matched
                        if (unpaired == 2)
                        {
                            foreach (string[] line2 in lines2)
                            {
                                // still print unpaired lines from file 2
                                bool firstout = true;
                                // note: we write empty fields for file 1
                                if (outfields1 != null)
                                {
                                    WriteFields(sw, null, outfields1, ref firstout, out_sep);
                                }
                                WriteFields(sw, line2, outfields2, ref firstout, out_sep);
                                sw.Write('\n');
                                out_lines++;
                                unmatched++;
                            }
                        }

                        // first check sort order, that could be a problem here
                        if (next2 != null && nextid2 < id2)
                        {
                            ReportUnsorted(s2, nextid2, id2);
                            break;
                        }

                        ReadNext(s2, lines2, ref id2, ref nextid2, ref next2, field2, req_fields2, sep, empty);
                    }
                }             // main loop
            }                 // inputs closed, output flushed

            Console.Error.WriteLine("Matched lines from file 1: {0}", matched1);
            Console.Error.WriteLine("Matched lines from file 2: {0}", matched2);
            if (unmatched > 0)
            {
                switch (unpaired)
                {
                case 1:
                    Console.Error.WriteLine("Unmatched lines from file 1: {0}", unmatched);
                    break;

                case 2:
                    Console.Error.WriteLine("Unmatched lines from file 2: {0}", unmatched);
                    break;
                }
            }
            Console.Error.WriteLine("Total lines output: {0}", out_lines);
        }

        /*
         * parses the integer value of a command line option; writes an error
         * message to the standard error and returns false if it is not a number
         */
        private static bool TryParseIntOption(string option, string value, out int result)
        {
            if (Int32.TryParse(value, out result))
            {
                return true;
            }
            Console.Error.WriteLine("Invalid parameter: {0} {1}\n  use numjoin -h for help\n", option, value);
            return false;
        }

        /*
         * opens the given input file for reading, - means the standard input
         */
        private static StreamReader OpenInput(string fn)
        {
            if (fn == "-")
            {
                return new StreamReader(Console.OpenStandardInput());
            }
            return new StreamReader(fn);
        }

        /*
         * writes the error message about an ID smaller than the previous one
         * to the standard error
         */
        private static void ReportUnsorted(StreamLineNum s, Int64 nextid, Int64 id)
        {
            string err = "Error: input file " + s.Fn +
                         " not sorted on line " + s.Line + " ( " +
                         nextid.ToString() + " < " + id.ToString() + ")!\n";
            Console.Error.WriteLine("{0}", err);
        }